optimum-rbln 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +2 -0
- optimum/rbln/__version__.py +1 -1
- optimum/rbln/modeling_base.py +3 -3
- optimum/rbln/transformers/__init__.py +2 -0
- optimum/rbln/transformers/models/__init__.py +1 -0
- optimum/rbln/transformers/models/llama/llama_architecture.py +49 -17
- optimum/rbln/transformers/models/llama/llama_architecture_cb.py +759 -0
- optimum/rbln/transformers/models/llama/modeling_llama.py +126 -32
- optimum/rbln/transformers/models/midm/__init__.py +32 -0
- optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +22 -0
- optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +303 -0
- optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +1473 -0
- optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +98 -0
- optimum/rbln/transformers/models/midm/midm_architecture.py +506 -0
- optimum/rbln/transformers/models/midm/modeling_midm.py +426 -0
- {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.4.dist-info}/METADATA +5 -4
- {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.4.dist-info}/RECORD +19 -11
- {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.4.dist-info}/WHEEL +1 -1
- {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/__init__.py
CHANGED
@@ -51,6 +51,7 @@ _import_structure = {
         "RBLNGPT2LMHeadModel",
         "RBLNWav2Vec2ForCTC",
         "RBLNLlamaForCausalLM",
+        "RBLNMidmLMHeadModel",
         "RBLNWhisperForConditionalGeneration",
     ],
     "diffusers": [
@@ -107,6 +108,7 @@ if TYPE_CHECKING:
         RBLNCLIPTextModelWithProjection,
         RBLNGPT2LMHeadModel,
         RBLNLlamaForCausalLM,
+        RBLNMidmLMHeadModel,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
     )
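Both hunks register the new Midm head model: the string entry extends the package's lazy _import_structure, and the mirrored import under TYPE_CHECKING keeps static analysis in sync, so the class resolves from the package root exactly like the existing Llama and GPT-2 entries. A quick sanity check of the export path, assuming the 0.1.4 wheel is installed (this snippet is illustrative and not part of the package):

    from optimum.rbln import RBLNMidmLMHeadModel
    from optimum.rbln.transformers import RBLNMidmLMHeadModel as FromSubpackage

    # Both import paths lazily resolve to the same class, defined in
    # optimum/rbln/transformers/models/midm/modeling_midm.py.
    assert RBLNMidmLMHeadModel is FromSubpackage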
optimum/rbln/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = '0.1.1'
+__version__ = '0.1.4'
optimum/rbln/modeling_base.py
CHANGED
@@ -99,7 +99,7 @@ class RBLNBaseModel(OptimizedModel, ABC):
 
     model_type = "rbln_model"
    auto_model_class = AutoModel  # feature extraction
-    config_name = "
+    config_name = "config.json"
 
     def __init__(
         self,
@@ -490,7 +490,7 @@ class RBLNModel(RBLNBaseModel):
         preprocessors = maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)
 
         # Get compilation arguments
-        if rbln_config_kwargs.
+        if (rbln_config := rbln_config_kwargs.pop("rbln_config", None)) is None:
             rbln_config = cls.get_rbln_config(preprocessors=preprocessors, model_config=config, **rbln_config_kwargs)
 
         rbln_runtime_configs = list(rbln_config.values())
@@ -595,7 +595,7 @@ class RBLNModelForImageClassification(RBLNModel):
             rbln_image_size = processor.size["shortest_edge"]
             break
         if rbln_image_size is None:
-            raise ValueError("`
+            raise ValueError("`rbln_image_size` should be specified!")
 
         if rbln_batch_size is None:
             rbln_batch_size = 1
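The functional change in the second hunk (@@ -490,7 +490,7) is the walrus expression: an explicitly passed rbln_config is popped out of the keyword arguments so it is not forwarded twice, and a default config is built from the preprocessors and model config only when the caller did not supply one. A minimal sketch of that pop-or-build pattern (the function name and keys below are illustrative, not optimum-rbln APIs):

    def resolve_config(kwargs: dict) -> dict:
        # Pop an explicit "rbln_config" so the remaining kwargs can be forwarded
        # cleanly; build a default only when the caller did not pass one.
        if (config := kwargs.pop("rbln_config", None)) is None:
            config = {"source": "defaults", **kwargs}
        return config

    print(resolve_config({"rbln_batch_size": 2}))                 # built from defaults
    print(resolve_config({"rbln_config": {"source": "caller"}}))  # caller-supplied config wins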
optimum/rbln/transformers/__init__.py
CHANGED
@@ -35,6 +35,7 @@ _import_structure = {
         "RBLNWav2Vec2ForCTC",
         "RBLNWhisperForConditionalGeneration",
         "RBLNLlamaForCausalLM",
+        "RBLNMidmLMHeadModel",
     ],
 }
 
@@ -45,6 +46,7 @@ if TYPE_CHECKING:
         RBLNCLIPTextModelWithProjection,
         RBLNGPT2LMHeadModel,
         RBLNLlamaForCausalLM,
+        RBLNMidmLMHeadModel,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
     )
optimum/rbln/transformers/models/__init__.py
CHANGED
@@ -24,5 +24,6 @@
 from .clip import RBLNCLIPTextModel, RBLNCLIPTextModelWithProjection
 from .gpt2 import RBLNGPT2LMHeadModel
 from .llama import RBLNLlamaForCausalLM
+from .midm import RBLNMidmLMHeadModel
 from .wav2vec2 import RBLNWav2Vec2ForCTC
 from .whisper import RBLNWhisperForConditionalGeneration
optimum/rbln/transformers/models/llama/llama_architecture.py
CHANGED
@@ -36,7 +36,6 @@ from transformers.models.llama.modeling_llama import (
     LlamaForCausalLM,
     LlamaModel,
     LlamaRotaryEmbedding,
-    repeat_kv,
 )
 
 
@@ -149,26 +148,41 @@ class _LlamaAttention(LlamaAttention):
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
+        # change to remove repeat
+        key_states = key_states.unsqueeze(2)
+        value_states = value_states.unsqueeze(2)
+        query_states = query_states.view(
+            bsz, self.num_key_value_heads, self.num_heads // self.num_key_value_heads, q_len, self.head_dim
+        )
+
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        # change to remove repeat
+        # key_states = repeat_kv(key_states, self.num_key_value_groups)
+        # value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
+        attn_weights = torch.matmul(query_states, key_states.transpose(3, 4)) / math.sqrt(self.head_dim)
+
+        # change to remove repeat
+        # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+        #     raise ValueError(
+        #         f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+        #         f" {attn_weights.size()}"
+        #     )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
+            else:
+                # change to remove repeat
+                attention_mask = attention_mask.unsqueeze(2)
+
             attn_weights = attn_weights + attention_mask
 
         # upcast attention to fp32
@@ -176,6 +190,9 @@ class _LlamaAttention(LlamaAttention):
         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
         attn_output = torch.matmul(attn_weights, value_states)
 
+        # change to remove repeat
+        attn_output = attn_output.view(bsz, self.num_heads, q_len, self.head_dim)
+
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
             raise ValueError(
                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
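Taken together, the two attention hunks replace the repeat_kv tiling with a broadcasted grouped-query formulation: queries are viewed as (bsz, num_key_value_heads, num_heads // num_key_value_heads, q_len, head_dim), keys and values get a singleton group dimension from unsqueeze(2), the score matmul therefore uses transpose(3, 4) on 5-D tensors, and the final view folds the groups back into num_heads. This avoids materializing repeated copies of each K/V head, which is what the recurring "# change to remove repeat" comments refer to. A standalone sketch (tensor sizes are arbitrary, not taken from the diff) checking that the broadcast path matches the repeated-KV reference:

    import torch

    bsz, n_heads, n_kv, q_len, kv_len, d = 2, 8, 2, 4, 6, 16
    group = n_heads // n_kv

    q = torch.randn(bsz, n_heads, q_len, d)
    k = torch.randn(bsz, n_kv, kv_len, d)
    v = torch.randn(bsz, n_kv, kv_len, d)

    # Reference path: tile each KV head `group` times, as transformers' repeat_kv does.
    k_rep = k.repeat_interleave(group, dim=1)
    v_rep = v.repeat_interleave(group, dim=1)
    ref = torch.softmax(q @ k_rep.transpose(2, 3) / d**0.5, dim=-1) @ v_rep

    # Broadcast path from the diff: 5-D queries against unsqueezed K/V.
    q5 = q.view(bsz, n_kv, group, q_len, d)
    scores = q5 @ k.unsqueeze(2).transpose(3, 4) / d**0.5        # (bsz, n_kv, group, q_len, kv_len)
    out = (torch.softmax(scores, dim=-1) @ v.unsqueeze(2)).view(bsz, n_heads, q_len, d)

    print(torch.allclose(ref, out, atol=1e-5))  # True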
@@ -516,17 +533,32 @@ class RebelDynamicCache(DynamicCache):
         if len(self.key_cache) <= layer_idx:
             self.key_cache.append(key_states)
             self.value_cache.append(value_states)
+            return self.key_cache[layer_idx], self.value_cache[layer_idx]
         else:
-            self.key_cache[layer_idx] = self.key_cache[layer_idx].slice_scatter(
-                key_states, dim=2, start=self.current_step, end=self.current_step + key_states.shape[2]
+            # change to remove repeat
+            # self.key_cache[layer_idx] = self.key_cache[layer_idx].slice_scatter(
+            #     key_states, dim=2, start=self.current_step, end=self.current_step + key_states.shape[2]
+            # )
+            # self.value_cache[layer_idx] = self.value_cache[layer_idx].slice_scatter(
+            #     value_states, dim=2, start=self.current_step, end=self.current_step + value_states.shape[2]
+            # )
+            updated_key = (
+                self.key_cache[layer_idx]
+                .unsqueeze(2)
+                .slice_scatter(
+                    key_states, dim=-2, start=self.current_step, end=self.current_step + key_states.shape[-2]
+                )
             )
-            self.value_cache[layer_idx] = self.value_cache[layer_idx].slice_scatter(
-                value_states, dim=2, start=self.current_step, end=self.current_step + value_states.shape[2]
+            updated_value = (
+                self.value_cache[layer_idx]
+                .unsqueeze(2)
+                .slice_scatter(
+                    value_states, dim=-2, start=self.current_step, end=self.current_step + value_states.shape[-2]
+                )
             )
-
-
-
-        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+            self.key_cache[layer_idx] = updated_key.squeeze(2)
+            self.value_cache[layer_idx] = updated_value.squeeze(2)
+            return updated_key, updated_value
 
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         """Returns the sequence length of the cached states. A layer index can be optionally passed."""