optimum_rbln-0.1.12-py3-none-any.whl → optimum_rbln-0.1.15-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
Files changed (90)
  1. optimum/rbln/__init__.py +27 -13
  2. optimum/rbln/__version__.py +16 -1
  3. optimum/rbln/diffusers/__init__.py +22 -2
  4. optimum/rbln/diffusers/models/__init__.py +34 -3
  5. optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
  6. optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +66 -111
  7. optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
  8. optimum/rbln/diffusers/models/controlnet.py +85 -65
  9. optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
  10. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
  11. optimum/rbln/diffusers/models/unets/__init__.py +24 -0
  12. optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +129 -163
  13. optimum/rbln/diffusers/pipelines/__init__.py +60 -12
  14. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -25
  15. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -185
  16. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -190
  17. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -191
  18. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -192
  19. optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
  20. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -110
  21. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -118
  22. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
  23. optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
  24. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
  25. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
  26. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
  27. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
  28. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +18 -128
  29. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -131
  30. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
  31. optimum/rbln/modeling.py +572 -0
  32. optimum/rbln/modeling_alias.py +1 -1
  33. optimum/rbln/modeling_base.py +176 -763
  34. optimum/rbln/modeling_diffusers.py +329 -0
  35. optimum/rbln/transformers/__init__.py +2 -2
  36. optimum/rbln/transformers/cache_utils.py +5 -9
  37. optimum/rbln/transformers/modeling_rope_utils.py +283 -0
  38. optimum/rbln/transformers/models/__init__.py +80 -31
  39. optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
  40. optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
  41. optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
  42. optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
  43. optimum/rbln/transformers/models/clip/modeling_clip.py +8 -34
  44. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -5
  45. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +779 -361
  46. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +83 -142
  47. optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
  48. optimum/rbln/transformers/models/exaone/exaone_architecture.py +64 -39
  49. optimum/rbln/transformers/models/exaone/modeling_exaone.py +6 -29
  50. optimum/rbln/transformers/models/gemma/gemma_architecture.py +31 -92
  51. optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
  52. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
  53. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +6 -31
  54. optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
  55. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +29 -83
  56. optimum/rbln/transformers/models/midm/midm_architecture.py +88 -253
  57. optimum/rbln/transformers/models/midm/modeling_midm.py +8 -33
  58. optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
  59. optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
  60. optimum/rbln/transformers/models/phi/phi_architecture.py +61 -345
  61. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +5 -29
  62. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
  63. optimum/rbln/transformers/models/t5/__init__.py +1 -1
  64. optimum/rbln/transformers/models/t5/modeling_t5.py +157 -6
  65. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  66. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  67. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
  68. optimum/rbln/transformers/utils/rbln_quantization.py +128 -5
  69. optimum/rbln/utils/decorator_utils.py +59 -0
  70. optimum/rbln/utils/hub.py +131 -0
  71. optimum/rbln/utils/import_utils.py +21 -0
  72. optimum/rbln/utils/model_utils.py +53 -0
  73. optimum/rbln/utils/runtime_utils.py +5 -5
  74. optimum/rbln/utils/submodule.py +114 -0
  75. optimum/rbln/utils/timer_utils.py +2 -2
  76. optimum_rbln-0.1.15.dist-info/METADATA +106 -0
  77. optimum_rbln-0.1.15.dist-info/RECORD +110 -0
  78. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
  79. optimum/rbln/transformers/generation/streamers.py +0 -139
  80. optimum/rbln/transformers/generation/utils.py +0 -397
  81. optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
  82. optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
  83. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
  84. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
  85. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
  86. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
  87. optimum_rbln-0.1.12.dist-info/METADATA +0 -119
  88. optimum_rbln-0.1.12.dist-info/RECORD +0 -103
  89. optimum_rbln-0.1.12.dist-info/entry_points.txt +0 -4
  90. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/gemma/gemma_architecture.py
@@ -21,103 +21,42 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING
 
-import torch
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-)
-
-from ...models.decoderonly import (
+from ...models.decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
-    DecoderOnlyDecoderLayer,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
     DecoderOnlyWrapper,
-    slice_and_unsqueeze_cos_sin,
 )
 
 
-class GemmaWrapper(DecoderOnlyWrapper):
-    def get_forward_dict(self):
-        forward_dict = {}
-        forward_dict.update(
-            {
-                "wrapper": GemmaModel.forward,
-                "model": DecoderOnlyDecoderLayer.forward,
-                "decoder_layer": DecoderOnlyAttention.forward,
-            }
-        )
-        return forward_dict
-
-
-class GemmaModel:
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = True,
-        output_attentions: Optional[bool] = False,
-        output_hidden_states: Optional[bool] = False,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-        rotary_pos_emb=None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        # embed positions
-        inputs_embeds = self.embed_tokens(input_ids)
-        hidden_states = inputs_embeds
-
-        ##### GEMMA change from llama#####
-        hidden_states = hidden_states * (self.config.hidden_size**0.5)
-
-        attention_mask = (1 - attention_mask) * torch.finfo(torch.float16).min
-
-        # get cos,sin vector
-        cos, sin = rotary_pos_emb(inputs_embeds, attention_mask.shape[-1])
-        cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)
+if TYPE_CHECKING:
+    from transformers import GemmaForCausalLM
 
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
 
-        for layer_idx, decoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            layer_outputs = forward_dict["model"](
-                decoder_layer,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                batch_ids=batch_ids,
-                cos=cos,
-                sin=sin,
-                forward_dict=forward_dict,
-            )
-
-            hidden_states = layer_outputs[0]
-
-            updated_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
+class GemmaWrapper(DecoderOnlyWrapper):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GemmaForCausalLM"):
+        new_layers = []
+        for layer in causal_lm.model.layers:
+            if self.attn_impl == "eager":
+                new_self_attn = DecoderOnlyAttention(layer.self_attn)
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = DecoderOnlyFlashAttention(
+                    layer.self_attn, kvcache_partition_len=self.kvcache_partition_len
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+            new_layer = DecoderOnlyLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = GemmaModel(causal_lm.model, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class GemmaModel(DecoderOnlyModel):
+    @property
+    def hidden_multiplier(self):
+        return self._original_mod.config.hidden_size**0.5
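The new gemma_architecture no longer routes forward passes through a `forward_dict` of patched methods; it composes adapter objects around the original Hugging Face modules and overrides only the model-specific hook (here, the embedding scale exposed as `hidden_multiplier`). Below is a minimal, self-contained sketch of that composition pattern; `ToyDecoder`, `DecoderAdapter`, and `GemmaLikeAdapter` are illustrative stand-ins, not optimum-rbln classes.

```python
import torch
import torch.nn as nn


class ToyDecoder(nn.Module):
    """Stand-in for a Hugging Face decoder body (hypothetical, for illustration only)."""

    def __init__(self, vocab_size: int = 32, hidden_size: int = 16):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.norm(self.embed_tokens(input_ids))


class DecoderAdapter(nn.Module):
    """Hold the original module and delegate to it, overriding only model-specific hooks."""

    def __init__(self, original_mod: nn.Module):
        super().__init__()
        self._original_mod = original_mod

    @property
    def hidden_multiplier(self) -> float:
        return 1.0  # most models do not rescale their embeddings

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        hidden = self._original_mod.embed_tokens(input_ids) * self.hidden_multiplier
        return self._original_mod.norm(hidden)


class GemmaLikeAdapter(DecoderAdapter):
    @property
    def hidden_multiplier(self) -> float:
        # Gemma scales token embeddings by sqrt(hidden_size) before the decoder stack.
        return self._original_mod.hidden_size ** 0.5


ids = torch.randint(0, 32, (1, 4))
print(GemmaLikeAdapter(ToyDecoder())(ids).shape)  # torch.Size([1, 4, 16])
```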
optimum/rbln/transformers/models/gemma/modeling_gemma.py
@@ -21,28 +21,18 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-import inspect
-import logging
-from typing import TYPE_CHECKING, Any, Callable
-
-from transformers import GemmaForCausalLM
-
+from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .gemma_architecture import GemmaWrapper
 
 
-if TYPE_CHECKING:
-    from transformers import PreTrainedModel
-
-    from ....modeling_config import RBLNConfig
-
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 
 class RBLNGemmaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """
     The Gemma Model transformer with a language modeling head (linear layer) on top.
-    This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
     A class to convert and run pre-trained transformers based GemmaForCausalLM model on RBLN devices.
     It implements the methods to convert a pre-trained transformers GemmaForCausalLM model into a RBLN transformer model by:
@@ -50,18 +40,4 @@ class RBLNGemmaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     - compiling the resulting graph using the RBLN compiler.
     """
 
-    @classmethod
-    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-        rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-        return GemmaWrapper(model, rbln_max_seq_len).eval()
-
-    def __getattr__(self, __name: str) -> Any:
-        def redirect(func):
-            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-        val = getattr(GemmaForCausalLM, __name)
-
-        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-            return redirect(val)
-
-        return val
+    _decoder_wrapper_cls = GemmaWrapper
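The public entry point is unchanged by this refactor: the model is still compiled and run through `from_pretrained`. A usage sketch follows; the `rbln_*` keyword names mirror the convention visible in this diff (the old code read `rbln_config.model_cfg["max_seq_len"]`), but the exact options accepted by 0.1.15 should be checked against the release documentation, and the checkpoint name is only an example.

```python
from transformers import AutoTokenizer

from optimum.rbln import RBLNGemmaForCausalLM

# Compile the Hugging Face checkpoint for RBLN NPUs, then run ordinary generate().
model = RBLNGemmaForCausalLM.from_pretrained(
    "google/gemma-2b",
    export=True,            # trace and compile with the RBLN compiler
    rbln_max_seq_len=8192,  # static sequence length fixed at compile time
    rbln_batch_size=1,
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```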
optimum/rbln/transformers/models/gpt2/gpt2_architecture.py
@@ -21,262 +21,74 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from typing import Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Tuple
 
 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import BaseModelOutputWithPast
 
-from ...cache_utils import RebelDynamicCache_4D
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+)
 
 
-class GPT2LMHeadModelWrapper(torch.nn.Module):
-    def __init__(self, model, max_seq_len):
-        super().__init__()
-        self.model = model.transformer
-        self.lm_head = model.lm_head
-        self.config = model.config
-        self.max_seq_len = max_seq_len
-        self.forward_dict = self.get_forward_dict()
+if TYPE_CHECKING:
+    from transformers import GPT2LMHeadModel
 
-    def get_forward_dict(self):
-        forward_dict = {
-            "wrapper": _GPT2Model.forward,
-            "model": _GPT2Block.forward,
-            "decoder_layer": _GPT2Attention.forward,
-        }
-        return forward_dict
 
-    def forward(
-        self,
-        input_ids,
-        attention_mask,
-        cache_position,
-        batch_position,
-        query_idx,
-        *past_key_values,
-    ):
-        if input_ids.shape[1] == 1:
-            rbln_batch_position = None
-        else:
-            rbln_batch_position = batch_position
+class GPT2Wrapper(DecoderOnlyWrapper):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GPT2LMHeadModel"):
+        if self.attn_impl != "eager":
+            raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            new_self_attn = GPT2Attention(layer.attn)
+            new_layer = GPT2Layer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = GPT2Model(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
 
-        # Formatting list of past_kv to DynamicCache class.
-        past_key_value = RebelDynamicCache_4D.from_input_format(
-            cache_position,
-            self.config.n_layer,
-            *past_key_values,
-        )
-
-        outputs = self.forward_dict["wrapper"](
-            self.model,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=cache_position,
-            past_key_value=past_key_value,
-            batch_ids=rbln_batch_position,
-            forward_dict=self.forward_dict,
-            # rotary_emb differenct from_llama
-        )
-
-        hidden_states = outputs[0]
-        if batch_position >= 0:
-            hidden_states = hidden_states[:, query_idx].unsqueeze(1)
-        logits = self.lm_head(hidden_states)
-
-        output = (logits,) + outputs[1:]
 
-        return output, batch_position + query_idx
+class GPT2Model(DecoderOnlyModel):
+    mask_fmin = torch.finfo(torch.float32).min
 
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_f
 
-class _GPT2Model:
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-    ) -> BaseModelOutputWithPast:
-        b_size, q_len = input_ids.shape
-        inputs_embeds = self.wte(input_ids)
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte
 
-        if position_ids.shape[0] > 1:
-            position_embeds = []
-            for b_idx in range(b_size):
-                position_embed = self.wpe(position_ids[b_idx])
-                # position_embed = position_embed.dtype(inputs_embeds.dtype)
-                position_embeds.append(position_embed)
+    def get_pos_embedding(self) -> nn.Embedding:
+        return self._original_mod.wpe
 
-            position_embeds = torch.cat(position_embeds, dim=0).unsqueeze(1)
-        else:
-            position_embeds = self.wpe(position_ids)
 
-        hidden_states = inputs_embeds + position_embeds
+class GPT2Layer(DecoderOnlyLayer):
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_1
 
-        # GPT2Attention mask.
-        # Here we assume mask is causal mask, (batch, 1, query_length, key_length + query_length)
-        attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_2
 
-        for layer_idx, block in enumerate(self.h):
-            hidden_states, updated_cache = forward_dict["model"](
-                block,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                past_key_value=past_key_value,
-                position_ids=position_ids,
-                batch_ids=batch_ids,
-                forward_dict=forward_dict,
-            )
-
-        hidden_states = self.ln_f(hidden_states)
-        output_shape = (-1,) + (q_len,) + (hidden_states.size(-1),)
-        hidden_states = hidden_states.view(output_shape)
-
-        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-        )
 
+class GPT2Attention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.c_attn = self._original_mod.c_attn
+        self.o_proj = self._original_mod.c_proj
+        self.split_size = self._original_mod.split_size
+        self.num_key_value_heads = self._original_mod.num_heads
 
-class _GPT2Block:
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_idx: int,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, RebelDynamicCache_4D]:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
+    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        return query_states, key_states, value_states
 
-        hidden_states, k, v = forward_dict["decoder_layer"](
-            self.attn,
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            batch_index=batch_ids,
+    def rbln_attention(self, *args, **kwargs):
+        return super().rbln_attention(
+            *args,
+            **kwargs,
+            layer_idx=self.layer_idx,
+            scale_attn_by_inverse_layer_idx=self._original_mod.scale_attn_by_inverse_layer_idx,
         )
-        past_key_value.assign(k, v, layer_idx)
-
-        # residual connection
-        hidden_states = residual + hidden_states
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        return hidden_states, past_key_value
-
-
-class _GPT2Attention:
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
-            )
-
-        # Layer-wise attention scaling
-        if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        # -------------------
-        # Below are deleted since "where" op does not supported on RBLN graph.
-        # -------------------
-        # if not self.is_cross_attention:
-        #     # if only "normal" attention layer implements causal mask
-        #     query_length, key_length = query.size(-2), key.size(-2)
-        #     causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
-        #     mask_value = torch.finfo(attn_weights.dtype).min
-        #     # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        #     # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-        #     mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
-        #     attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
-
-        # Apply the attention mask
-        attn_weights.view(
-            -1,
-        )
-        attn_weights = attn_weights + attention_mask
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_index: Optional[int] = None,
-        **kwargs,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        bsz, q_len, _ = hidden_states.size()
-        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
-
-        querys = self._split_heads(query, self.num_heads, self.head_dim)  # (batch, head, seq_length, head_features)
-        keys = self._split_heads(key, self.num_heads, self.head_dim)
-        values = self._split_heads(value, self.num_heads, self.head_dim)
-
-        # Decoder
-        if (batch_index is None or batch_index == -1) and bsz > 1:
-            all_keys = []
-            all_values = []
-            all_attn_output = []
-
-            for b in range(bsz):
-                query = querys[b].unsqueeze(0)
-                attn_mask = attention_mask[b].unsqueeze(0)
-                key = keys[b].unsqueeze(0)
-                value = values[b].unsqueeze(0)
-
-                key, value = past_key_value.update(
-                    key,
-                    value,
-                    self.layer_idx,
-                    b,
-                )
-
-                attn_output, _ = _GPT2Attention._attn(self, query, key, value, attn_mask)
-                attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-                all_keys.append(key)
-                all_values.append(value)
-                all_attn_output.append(attn_output)
-
-            keys = torch.cat(all_keys, dim=0)
-            values = torch.cat(all_values, dim=0)
-            attn_output = torch.cat(all_attn_output, dim=0)
-
-        # Prefill
-        else:
-            if batch_index is None or batch_index == -1:
-                batch_index = 0
-
-            keys, values = past_key_value.update(
-                keys,
-                values,
-                self.layer_idx,
-                batch_index,
-                read_first_step=True,
-            )
-
-            attn_output, _ = _GPT2Attention._attn(self, querys, keys, values, attention_mask)
-            attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-        attn_output = self.c_proj(attn_output)
-
-        return attn_output, keys, values
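The only attention-specific override GPT-2 keeps is `projection`: GPT-2 computes query, key, and value with one fused `c_attn` projection that is split afterwards, whereas Llama-style models use three separate projections. A small self-contained torch sketch of the difference (plain `nn.Linear` stands in for GPT-2's actual `Conv1D` module; shapes are toy values):

```python
import torch
import torch.nn as nn

hidden_size, seq_len = 8, 4
hidden_states = torch.randn(1, seq_len, hidden_size)

# GPT-2 style: one fused projection producing [q | k | v] along the last dim,
# split afterwards with split_size == hidden_size.
c_attn = nn.Linear(hidden_size, 3 * hidden_size)
q, k, v = c_attn(hidden_states).split(hidden_size, dim=2)

# Llama/Gemma style: three separate projections.
q_proj = nn.Linear(hidden_size, hidden_size)
k_proj = nn.Linear(hidden_size, hidden_size)
v_proj = nn.Linear(hidden_size, hidden_size)
q2, k2, v2 = q_proj(hidden_states), k_proj(hidden_states), v_proj(hidden_states)

print(q.shape, k.shape, v.shape)  # each torch.Size([1, 4, 8])
```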
optimum/rbln/transformers/models/gpt2/modeling_gpt2.py
@@ -21,20 +21,12 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-import inspect
-import logging
-from typing import TYPE_CHECKING, Any, Callable
-
-from transformers import GPT2LMHeadModel
-
-from ....modeling_config import RBLNConfig
+from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
-from .gpt2_architecture import GPT2LMHeadModelWrapper
+from .gpt2_architecture import GPT2Wrapper  # GPT2LMHeadModelWrapper
 
 
-logger = logging.getLogger(__name__)
-if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+logger = logging.get_logger(__name__)
 
 
 class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
@@ -42,7 +34,7 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
     The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
     embeddings).
 
-    This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the
     library implements for all its model.
 
     It implements the methods to convert a pre-trained transformers GPT2 model into a RBLN transformer model by:
@@ -51,22 +43,5 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
 
     """
 
-    @classmethod
-    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-        rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-        return GPT2LMHeadModelWrapper(model, rbln_max_seq_len).eval()
-
-    def __getattr__(self, __name: str) -> Any:
-        """This is the key method to implement RBLN-GPT2.
-
-        Returns:
-            Any: GPT2's corresponding method
-        """
-
-        def redirect(func):
-            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-        val = getattr(GPT2LMHeadModel, __name)
-        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-            return redirect(val)
-        return val
+    _decoder_wrapper_cls = GPT2Wrapper
+    _use_rotary_emb = False
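Across these modeling files the per-model boilerplate (`wrap_model_if_needed` plus a `__getattr__` shim) collapses into two class attributes that the shared base class reads. A minimal sketch of how such declarative class attributes can drive a common export path; `BaseForCausalLM`, `FakeWrapper`, and `GPT2LikeForCausalLM` are illustrative names, not the actual RBLN base-class API.

```python
from typing import Type


class FakeWrapper:
    """Placeholder for a compile-time model wrapper."""

    def __init__(self, model: object, use_rotary_emb: bool):
        self.model = model
        self.use_rotary_emb = use_rotary_emb


class BaseForCausalLM:
    # Subclasses only declare which wrapper to use and whether rotary embeddings apply;
    # the shared wrap_model() path below reads these class attributes.
    _decoder_wrapper_cls: Type[FakeWrapper] = FakeWrapper
    _use_rotary_emb: bool = True

    @classmethod
    def wrap_model(cls, model: object) -> FakeWrapper:
        return cls._decoder_wrapper_cls(model, use_rotary_emb=cls._use_rotary_emb)


class GPT2LikeForCausalLM(BaseForCausalLM):
    _use_rotary_emb = False  # GPT-2 uses learned absolute position embeddings


print(BaseForCausalLM.wrap_model("llama-like").use_rotary_emb)   # True
print(GPT2LikeForCausalLM.wrap_model("gpt2-like").use_rotary_emb)  # False
```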
optimum/rbln/transformers/models/llama/modeling_llama.py
@@ -21,28 +21,18 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-import inspect
-import logging
-from typing import TYPE_CHECKING, Any, Callable
-
-from transformers import LlamaForCausalLM
-
+from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .llama_architecture import LlamaWrapper
 
 
-if TYPE_CHECKING:
-    from transformers import PreTrainedModel
-
-    from ....modeling_config import RBLNConfig
-
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)
 
 
 class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """
     The Llama Model transformer with a language modeling head (linear layer) on top.
-    This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
     A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
     It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
@@ -50,18 +40,4 @@ class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     - compiling the resulting graph using the RBLN compiler.
     """
 
-    @classmethod
-    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-        rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-        return LlamaWrapper(model, rbln_max_seq_len).eval()
-
-    def __getattr__(self, __name: str) -> Any:
-        def redirect(func):
-            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-        val = getattr(LlamaForCausalLM, __name)
-
-        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-            return redirect(val)
-
-        return val
+    _decoder_wrapper_cls = LlamaWrapper
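For reference, the `__getattr__` shim removed from each of these modeling files worked by looking the requested name up on the Hugging Face class and re-binding `self` when it found a method. A standalone sketch of that delegation pattern, using hypothetical `Upstream`/`Redirector` classes rather than the real model classes:

```python
import inspect
from typing import Any, Callable


class Upstream:
    """Stand-in for a Hugging Face model class whose methods we want to reuse."""

    def greet(self, name: str) -> str:
        return f"hello {name}"


class Redirector:
    """Forward unknown attribute lookups to Upstream, re-binding methods to this instance."""

    def __getattr__(self, __name: str) -> Any:
        def redirect(func):
            # Bind the unbound Upstream function to this Redirector instance.
            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)

        val = getattr(Upstream, __name)
        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
            return redirect(val)
        return val


print(Redirector().greet("rbln"))  # hello rbln
```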