optimum-rbln 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those public registries.
Files changed (103)
  1. optimum/rbln/__init__.py +41 -38
  2. optimum/rbln/__version__.py +16 -1
  3. optimum/rbln/diffusers/__init__.py +26 -2
  4. optimum/rbln/{modeling_diffusers.py → diffusers/modeling_diffusers.py} +97 -126
  5. optimum/rbln/diffusers/models/__init__.py +36 -3
  6. optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
  7. optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +73 -61
  8. optimum/rbln/diffusers/models/autoencoders/vae.py +83 -0
  9. optimum/rbln/diffusers/models/controlnet.py +54 -14
  10. optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
  11. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
  12. optimum/rbln/diffusers/models/unets/__init__.py +24 -0
  13. optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +82 -22
  14. optimum/rbln/diffusers/pipelines/__init__.py +23 -2
  15. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +13 -33
  16. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +17 -2
  17. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +18 -2
  18. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +18 -2
  19. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +18 -2
  20. optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
  21. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +2 -2
  22. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -13
  23. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +31 -0
  24. optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
  25. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +31 -0
  26. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +31 -0
  27. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +31 -0
  28. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +24 -0
  29. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +15 -8
  30. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +15 -8
  31. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +31 -0
  32. optimum/rbln/modeling.py +238 -0
  33. optimum/rbln/modeling_base.py +186 -760
  34. optimum/rbln/modeling_config.py +31 -7
  35. optimum/rbln/ops/__init__.py +26 -0
  36. optimum/rbln/ops/attn.py +221 -0
  37. optimum/rbln/ops/flash_attn.py +70 -0
  38. optimum/rbln/ops/kv_cache_update.py +69 -0
  39. optimum/rbln/transformers/__init__.py +20 -2
  40. optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py} +5 -1
  41. optimum/rbln/transformers/modeling_generic.py +385 -0
  42. optimum/rbln/transformers/models/auto/__init__.py +23 -0
  43. optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
  44. optimum/rbln/transformers/models/auto/modeling_auto.py +36 -12
  45. optimum/rbln/transformers/models/bart/__init__.py +0 -1
  46. optimum/rbln/transformers/models/bart/bart_architecture.py +107 -464
  47. optimum/rbln/transformers/models/bart/modeling_bart.py +10 -9
  48. optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
  49. optimum/rbln/transformers/models/clip/modeling_clip.py +8 -25
  50. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -10
  51. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +775 -514
  52. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +128 -260
  53. optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
  54. optimum/rbln/transformers/models/exaone/exaone_architecture.py +60 -45
  55. optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -2
  56. optimum/rbln/transformers/models/gemma/gemma_architecture.py +33 -104
  57. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
  58. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +3 -2
  59. optimum/rbln/transformers/models/llama/llama_architecture.py +0 -1
  60. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -75
  61. optimum/rbln/transformers/models/midm/midm_architecture.py +84 -238
  62. optimum/rbln/transformers/models/midm/modeling_midm.py +5 -6
  63. optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -1
  64. optimum/rbln/transformers/models/phi/phi_architecture.py +60 -261
  65. optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -1
  66. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +58 -103
  67. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +498 -0
  68. optimum/rbln/transformers/models/t5/__init__.py +0 -1
  69. optimum/rbln/transformers/models/t5/modeling_t5.py +106 -5
  70. optimum/rbln/transformers/models/t5/t5_architecture.py +106 -448
  71. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  72. optimum/rbln/transformers/models/whisper/generation_whisper.py +42 -0
  73. optimum/rbln/transformers/models/whisper/modeling_whisper.py +78 -55
  74. optimum/rbln/transformers/models/whisper/whisper_architecture.py +219 -312
  75. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
  76. optimum/rbln/transformers/utils/rbln_quantization.py +120 -4
  77. optimum/rbln/utils/decorator_utils.py +51 -11
  78. optimum/rbln/utils/hub.py +131 -0
  79. optimum/rbln/utils/import_utils.py +22 -1
  80. optimum/rbln/utils/logging.py +37 -0
  81. optimum/rbln/utils/model_utils.py +52 -0
  82. optimum/rbln/utils/runtime_utils.py +10 -4
  83. optimum/rbln/utils/save_utils.py +17 -0
  84. optimum/rbln/utils/submodule.py +137 -0
  85. optimum_rbln-0.2.0.dist-info/METADATA +117 -0
  86. optimum_rbln-0.2.0.dist-info/RECORD +114 -0
  87. {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.2.0.dist-info}/WHEEL +1 -1
  88. optimum_rbln-0.2.0.dist-info/licenses/LICENSE +288 -0
  89. optimum/rbln/transformers/cache_utils.py +0 -107
  90. optimum/rbln/transformers/generation/streamers.py +0 -139
  91. optimum/rbln/transformers/generation/utils.py +0 -397
  92. optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
  93. optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
  94. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
  95. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
  96. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
  97. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
  98. optimum/rbln/utils/context.py +0 -58
  99. optimum/rbln/utils/timer_utils.py +0 -43
  100. optimum_rbln-0.1.13.dist-info/METADATA +0 -120
  101. optimum_rbln-0.1.13.dist-info/RECORD +0 -107
  102. optimum_rbln-0.1.13.dist-info/entry_points.txt +0 -4
  103. optimum_rbln-0.1.13.dist-info/licenses/LICENSE +0 -201
@@ -20,62 +20,77 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
-import torch
+
+from typing import TYPE_CHECKING
+
+import torch.nn as nn

 from ....utils import logging
-from ...models.decoderonly import (
+from ...models.decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
-    DecoderOnlyDecoderLayer,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
     DecoderOnlyModel,
     DecoderOnlyWrapper,
-    RotaryEmbedding,
 )


+if TYPE_CHECKING:
+    from transformers import PreTrainedModel as ExaoneForCausalLM
+
 logger = logging.get_logger(__name__)


 class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
     """A wrapper class for the Exaone model with a language modeling head."""

-    def __init__(self, model, max_seq_len, kvcache_partition_len=None):
-        super(DecoderOnlyWrapper, self).__init__()
-        self.config = model.config
-        self.model = self.convert_attribute_name(model.transformer)
-        self.lm_head = model.lm_head
-        self.rotary_emb = RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)
-
-        if kvcache_partition_len is not None:
-            # WORKAROUND : for passing partition length as a value to the rbln compiler.
-            # What is actually used is the shape of this tensor.
-            self.kvcache_partition_size = torch.zeros(kvcache_partition_len, dtype=torch.int32)
-            self.attn_implementation = "flash_attn_rbln"
-            logger.info(f"Using rbln-flash-attention. (partition length : {kvcache_partition_len})")
-        else:
-            self.kvcache_partition_size = None
-            self.attn_implementation = "eager"
-
-    @staticmethod
-    def convert_attribute_name(model):
-        model.embed_tokens = model.wte
-        model.norm = model.ln_f
-        model.layers = model.h
-
-        for layer in model.layers:
-            layer.input_layernorm = layer.ln_1
-            layer.self_attn = layer.attn.attention
-            layer.post_attention_layernorm = layer.ln_2
-            layer.self_attn.o_proj = layer.self_attn.out_proj
-
-        return model
-
-    def get_forward_dict(self):
-        forward_dict = {}
-        forward_dict.update(
-            {
-                "wrapper": DecoderOnlyModel.forward,
-                "model": DecoderOnlyDecoderLayer.forward,
-                "decoder_layer": DecoderOnlyAttention.forward,
-            }
-        )
-        return forward_dict
+    def convert_to_rbln_causal_lm(self, causal_lm: "ExaoneForCausalLM"):
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            if self.attn_impl == "eager":
+                new_self_attn = ExaoneAttention(layer.attn.attention)
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = ExaoneFlashAttention(
+                    layer.attn.attention, kvcache_partition_len=self.kvcache_partition_len
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+
+            new_layer = ExaoneLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = ExaoneModel(causal_lm.transformer, new_layers, partition_len=self.kvcache_partition_len)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class ExaoneModel(DecoderOnlyModel):
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte
+
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_f
+
+
+class ExaoneLayer(DecoderOnlyLayer):
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_1
+
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_2
+
+
+class ExaoneAttention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.o_proj = self._original_mod.out_proj
+
+
+class ExaoneFlashAttention(DecoderOnlyFlashAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.o_proj = self._original_mod.out_proj
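The hunk above replaces the old attribute-renaming and `forward_dict` plumbing with a small adapter layer: `convert_to_rbln_causal_lm` rebuilds the Hugging Face module tree out of generic `DecoderOnly*` wrappers, and the model-specific subclasses only override the accessors whose names differ in the original checkpoint. As a rough illustration of that pattern (the `MyModelWrapper`/`MyModelAttention` names and the `causal_lm.model.layers` module path below are hypothetical, not code from this release):

```python
# Hypothetical sketch of the adapter pattern introduced above; the wrapped
# model's class and attribute names are assumptions, not code from 0.2.0.
from optimum.rbln.transformers.models.decoderonly.decoderonly_architecture import (
    DecoderOnlyAttention,
    DecoderOnlyForCausalLM,
    DecoderOnlyLayer,
    DecoderOnlyModel,
    DecoderOnlyWrapper,
)


class MyModelWrapper(DecoderOnlyWrapper):
    def convert_to_rbln_causal_lm(self, causal_lm):
        # Wrap each original decoder layer with the generic RBLN layer classes.
        new_layers = []
        for layer in causal_lm.model.layers:  # assumed module path
            new_self_attn = MyModelAttention(layer.self_attn)
            new_layers.append(DecoderOnlyLayer(layer, new_self_attn))
        new_model = DecoderOnlyModel(causal_lm.model, new_layers)
        return DecoderOnlyForCausalLM(causal_lm, new_model)


class MyModelAttention(DecoderOnlyAttention):
    def __post_init__(self):
        # Map the original projection modules onto the names the generic
        # RBLN attention implementation expects.
        self.q_proj = self._original_mod.q_proj
        self.k_proj = self._original_mod.k_proj
        self.v_proj = self._original_mod.v_proj
        self.o_proj = self._original_mod.o_proj
```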
@@ -21,10 +21,12 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

+
+from transformers import AutoModelForCausalLM
+
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .exaone_architecture import ExaoneForCausalLMWrapper
-from .hf_hub_cached.modeling_exaone import ExaoneForCausalLM


 logger = logging.get_logger(__name__)
@@ -45,7 +47,7 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """

     _decoder_wrapper_cls = ExaoneForCausalLMWrapper
-    _original_cls = ExaoneForCausalLM
+    _hf_class = AutoModelForCausalLM

     @classmethod
     def from_pretrained(cls, *args, **kwargs):
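With `_hf_class = AutoModelForCausalLM`, the vendored `hf_hub_cached` copy of the EXAONE modeling code (deleted further down in this diff) is no longer needed; the original weights are now resolved through `transformers`' auto classes, with the overridden `from_pretrained` shown above presumably handling the model-specific loading details. A hypothetical usage sketch, assuming the class is exported at the package root and that the usual optimum-style `export=True` / `rbln_*` keyword arguments apply (neither is confirmed by this hunk):

```python
# Hypothetical usage sketch; the checkpoint name and rbln_* kwargs are
# assumptions about the optimum-rbln API, not taken from this diff.
from optimum.rbln import RBLNExaoneForCausalLM

model = RBLNExaoneForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",  # resolved via AutoModelForCausalLM
    export=True,                             # compile the model for the RBLN NPU
    rbln_max_seq_len=4096,                   # assumed compile-time option
)
model.save_pretrained("exaone-rbln")
```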
@@ -21,113 +21,42 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-)
-
-from ...models.decoderonly import (
-    DecoderOnlyDecoderLayer,
+from typing import TYPE_CHECKING
+
+from ...models.decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
     DecoderOnlyWrapper,
-    slice_and_unsqueeze_cos_sin,
 )
-from ...models.decoderonly.decoderonly_architecture import DECODERONLY_ATTENTION_CLASSES
-
-
-class GemmaWrapper(DecoderOnlyWrapper):
-    def get_forward_dict(self):
-        forward_dict = {}
-        forward_dict.update(
-            {
-                "wrapper": GemmaModel.forward,
-                "model": DecoderOnlyDecoderLayer.forward,
-                "decoder_layer": DECODERONLY_ATTENTION_CLASSES[self.attn_implementation].forward,
-            }
-        )
-        return forward_dict
-
-
-class GemmaModel:
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = True,
-        output_attentions: Optional[bool] = False,
-        output_hidden_states: Optional[bool] = False,
-        cache_pos_for_partitions: Optional[torch.Tensor] = None,
-        kvcache_partition_size: Optional[torch.Tensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-        rotary_pos_emb=None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        # retrieve input_ids and inputs_embeds
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError(
-                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
-            )
-
-        # embed positions
-        inputs_embeds = self.embed_tokens(input_ids)
-        hidden_states = inputs_embeds

-        ##### GEMMA change from llama#####
-        hidden_states = hidden_states * (self.config.hidden_size**0.5)

-        attention_mask = (1 - attention_mask) * torch.finfo(torch.float16).min
+if TYPE_CHECKING:
+    from transformers import GemmaForCausalLM

-        # get cos,sin vector
-        cos, sin = rotary_pos_emb(inputs_embeds, attention_mask.shape[-1])
-        cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)

-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-
-        for layer_idx, decoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            layer_outputs = forward_dict["model"](
-                decoder_layer,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                batch_ids=batch_ids,
-                cos=cos,
-                sin=sin,
-                cache_pos_for_partitions=cache_pos_for_partitions,
-                kvcache_partition_size=kvcache_partition_size,
-                forward_dict=forward_dict,
-            )
-
-            hidden_states = layer_outputs[0]
-
-            updated_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
+class GemmaWrapper(DecoderOnlyWrapper):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GemmaForCausalLM"):
+        new_layers = []
+        for layer in causal_lm.model.layers:
+            if self.attn_impl == "eager":
+                new_self_attn = DecoderOnlyAttention(layer.self_attn)
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = DecoderOnlyFlashAttention(
+                    layer.self_attn, kvcache_partition_len=self.kvcache_partition_len
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+            new_layer = DecoderOnlyLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = GemmaModel(causal_lm.model, new_layers, partition_len=self.kvcache_partition_len)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class GemmaModel(DecoderOnlyModel):
+    @property
+    def hidden_multiplier(self):
+        return self._original_mod.config.hidden_size**0.5
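The copied-and-edited Gemma forward pass removed above collapses into a single override: the only Gemma-specific behavior the old code carried was scaling the token embeddings by sqrt(hidden_size) before the decoder stack, which the new `hidden_multiplier` property expresses declaratively. A minimal sketch of how such a hook would be consumed by a shared decoder-only forward (the toy class below is illustrative, not the actual `DecoderOnlyModel` implementation):

```python
# Illustrative only: a toy base class showing where a hidden_multiplier hook
# plugs in; the real DecoderOnlyModel in 0.2.0 is more involved.
import torch
import torch.nn as nn


class ToyDecoderOnlyModel(nn.Module):
    def __init__(self, embed_tokens: nn.Embedding):
        super().__init__()
        self.embed_tokens = embed_tokens

    @property
    def hidden_multiplier(self) -> float:
        return 1.0  # default: no scaling

    def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
        # GemmaModel overrides hidden_multiplier with hidden_size**0.5, matching
        # the removed `hidden_states * (self.config.hidden_size**0.5)` line above.
        return self.embed_tokens(input_ids) * self.hidden_multiplier
```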
@@ -21,262 +21,74 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-from typing import Dict, Optional, Tuple, Union
+import math
+from typing import TYPE_CHECKING, Tuple

 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import BaseModelOutputWithPast

-from ...cache_utils import RebelDynamicCache_4D
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+)


-class GPT2LMHeadModelWrapper(torch.nn.Module):
-    def __init__(self, model, max_seq_len):
-        super().__init__()
-        self.model = model.transformer
-        self.lm_head = model.lm_head
-        self.config = model.config
-        self.max_seq_len = max_seq_len
-        self.forward_dict = self.get_forward_dict()
+if TYPE_CHECKING:
+    from transformers import GPT2LMHeadModel

-    def get_forward_dict(self):
-        forward_dict = {
-            "wrapper": _GPT2Model.forward,
-            "model": _GPT2Block.forward,
-            "decoder_layer": _GPT2Attention.forward,
-        }
-        return forward_dict

-    def forward(
-        self,
-        input_ids,
-        attention_mask,
-        cache_position,
-        batch_position,
-        query_idx,
-        *past_key_values,
-    ):
-        if input_ids.shape[1] == 1:
-            rbln_batch_position = None
-        else:
-            rbln_batch_position = batch_position
+class GPT2Wrapper(DecoderOnlyWrapper):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GPT2LMHeadModel"):
+        if self.attn_impl != "eager":
+            raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            new_self_attn = GPT2Attention(layer.attn)
+            new_layer = GPT2Layer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = GPT2Model(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm

-        # Formatting list of past_kv to DynamicCache class.
-        past_key_value = RebelDynamicCache_4D.from_input_format(
-            cache_position,
-            self.config.n_layer,
-            *past_key_values,
-        )

-        outputs = self.forward_dict["wrapper"](
-            self.model,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=cache_position,
-            past_key_value=past_key_value,
-            batch_ids=rbln_batch_position,
-            forward_dict=self.forward_dict,
-            # rotary_emb differenct from_llama
-        )
+class GPT2Model(DecoderOnlyModel):
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_f

-        hidden_states = outputs[0]
-        if batch_position >= 0:
-            hidden_states = hidden_states[:, query_idx].unsqueeze(1)
-        logits = self.lm_head(hidden_states)
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte

-        output = (logits,) + outputs[1:]
+    def get_pos_embedding(self) -> nn.Embedding:
+        return self._original_mod.wpe

-        return output, batch_position + query_idx

+class GPT2Layer(DecoderOnlyLayer):
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_1

-class _GPT2Model:
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-    ) -> BaseModelOutputWithPast:
-        b_size, q_len = input_ids.shape
-        inputs_embeds = self.wte(input_ids)
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_2

-        if position_ids.shape[0] > 1:
-            position_embeds = []
-            for b_idx in range(b_size):
-                position_embed = self.wpe(position_ids[b_idx])
-                # position_embed = position_embed.dtype(inputs_embeds.dtype)
-                position_embeds.append(position_embed)

-            position_embeds = torch.cat(position_embeds, dim=0).unsqueeze(1)
-        else:
-            position_embeds = self.wpe(position_ids)
+class GPT2Attention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.c_attn = self._original_mod.c_attn
+        self.o_proj = self._original_mod.c_proj
+        self.split_size = self._original_mod.split_size

-        hidden_states = inputs_embeds + position_embeds
+    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        return query_states, key_states, value_states

-        # GPT2Attention mask.
-        # Here we assume mask is causal mask, (batch, 1, query_length, key_length + query_length)
-        attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+    def get_attn_scale(self):
+        scale = 1.0
+        if self._original_mod.scale_attn_weights:
+            scale /= math.sqrt(self.head_dim)

-        for layer_idx, block in enumerate(self.h):
-            hidden_states, updated_cache = forward_dict["model"](
-                block,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                past_key_value=past_key_value,
-                position_ids=position_ids,
-                batch_ids=batch_ids,
-                forward_dict=forward_dict,
-            )
+        if self._original_mod.scale_attn_by_inverse_layer_idx:
+            scale /= 1 + self.layer_idx

-        hidden_states = self.ln_f(hidden_states)
-        output_shape = (-1,) + (q_len,) + (hidden_states.size(-1),)
-        hidden_states = hidden_states.view(output_shape)
-
-        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-        )
-
-
-class _GPT2Block:
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_idx: int,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, RebelDynamicCache_4D]:
-        residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-
-        hidden_states, k, v = forward_dict["decoder_layer"](
-            self.attn,
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            batch_index=batch_ids,
-        )
-        past_key_value.assign(k, v, layer_idx)
-
-        # residual connection
-        hidden_states = residual + hidden_states
-
-        residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        return hidden_states, past_key_value
-
-
-class _GPT2Attention:
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
-            )
-
-        # Layer-wise attention scaling
-        if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        # -------------------
-        # Below are deleted since "where" op does not supported on RBLN graph.
-        # -------------------
-        # if not self.is_cross_attention:
-        #     # if only "normal" attention layer implements causal mask
-        #     query_length, key_length = query.size(-2), key.size(-2)
-        #     causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
-        #     mask_value = torch.finfo(attn_weights.dtype).min
-        #     # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        #     # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
-        #     mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
-        #     attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
-
-        # Apply the attention mask
-        attn_weights.view(
-            -1,
-        )
-        attn_weights = attn_weights + attention_mask
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_index: Optional[int] = None,
-        **kwargs,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        bsz, q_len, _ = hidden_states.size()
-        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
-
-        querys = self._split_heads(query, self.num_heads, self.head_dim)  # (batch, head, seq_length, head_features)
-        keys = self._split_heads(key, self.num_heads, self.head_dim)
-        values = self._split_heads(value, self.num_heads, self.head_dim)
-
-        # Decoder
-        if (batch_index is None or batch_index == -1) and bsz > 1:
-            all_keys = []
-            all_values = []
-            all_attn_output = []
-
-            for b in range(bsz):
-                query = querys[b].unsqueeze(0)
-                attn_mask = attention_mask[b].unsqueeze(0)
-                key = keys[b].unsqueeze(0)
-                value = values[b].unsqueeze(0)
-
-                key, value = past_key_value.update(
-                    key,
-                    value,
-                    self.layer_idx,
-                    b,
-                )
-
-                attn_output, _ = _GPT2Attention._attn(self, query, key, value, attn_mask)
-                attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-                all_keys.append(key)
-                all_values.append(value)
-                all_attn_output.append(attn_output)
-
-            keys = torch.cat(all_keys, dim=0)
-            values = torch.cat(all_values, dim=0)
-            attn_output = torch.cat(all_attn_output, dim=0)
-
-        # Prefill
-        else:
-            if batch_index is None or batch_index == -1:
-                batch_index = 0
-
-            keys, values = past_key_value.update(
-                keys,
-                values,
-                self.layer_idx,
-                batch_index,
-                read_first_step=True,
-            )
-
-            attn_output, _ = _GPT2Attention._attn(self, querys, keys, values, attention_mask)
-            attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-        attn_output = self.c_proj(attn_output)
-
-        return attn_output, keys, values
+        return scale
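For GPT-2, the fused `c_attn` projection and the two attention-scaling config flags become two small overrides: `projection` splits the fused QKV output along the last dimension, and `get_attn_scale` folds `scale_attn_weights` and `scale_attn_by_inverse_layer_idx` into a single multiplier. A standalone worked example of that scale computation follows (the GPT-2 base shapes, hidden_size 768 over 12 heads giving head_dim 64, are assumed for the numbers):

```python
# Standalone re-derivation of get_attn_scale above; the config values used in
# the asserts are stock GPT-2 defaults, assumed here purely for the arithmetic.
import math


def gpt2_attn_scale(
    head_dim: int,
    layer_idx: int,
    scale_attn_weights: bool = True,
    scale_attn_by_inverse_layer_idx: bool = False,
) -> float:
    scale = 1.0
    if scale_attn_weights:
        scale /= math.sqrt(head_dim)  # 1/sqrt(64) = 0.125 for GPT-2 base
    if scale_attn_by_inverse_layer_idx:
        scale /= 1 + layer_idx  # optional layer-wise damping
    return scale


assert gpt2_attn_scale(head_dim=64, layer_idx=0) == 0.125
assert gpt2_attn_scale(head_dim=64, layer_idx=3, scale_attn_by_inverse_layer_idx=True) == 0.125 / 4
```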
@@ -23,7 +23,7 @@

 from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
-from .gpt2_architecture import GPT2LMHeadModelWrapper
+from .gpt2_architecture import GPT2Wrapper


 logger = logging.get_logger(__name__)
@@ -43,4 +43,5 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):

     """

-    _decoder_wrapper_cls = GPT2LMHeadModelWrapper
+    _decoder_wrapper_cls = GPT2Wrapper
+    _use_rotary_emb = False
@@ -21,7 +21,6 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-
 from ...models.decoderonly.decoderonly_architecture import DecoderOnlyWrapper
