optimum-rbln 0.1.0__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
Files changed (41)
  1. optimum/rbln/__init__.py +8 -0
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/__init__.py +7 -0
  4. optimum/rbln/diffusers/models/autoencoder_kl.py +30 -9
  5. optimum/rbln/diffusers/models/controlnet.py +93 -23
  6. optimum/rbln/diffusers/models/unet_2d_condition.py +78 -61
  7. optimum/rbln/diffusers/pipelines/__init__.py +7 -2
  8. optimum/rbln/diffusers/pipelines/controlnet/__init__.py +4 -0
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +768 -0
  10. optimum/rbln/diffusers/pipelines/{stable_diffusion → controlnet}/pipeline_controlnet_img2img.py +25 -16
  11. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +942 -0
  12. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +955 -0
  13. optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  14. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -4
  15. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +22 -9
  16. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +19 -3
  17. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +19 -3
  18. optimum/rbln/modeling_base.py +39 -6
  19. optimum/rbln/modeling_seq2seq.py +19 -4
  20. optimum/rbln/transformers/__init__.py +2 -0
  21. optimum/rbln/transformers/generation/__init__.py +1 -0
  22. optimum/rbln/transformers/generation/streamers.py +17 -0
  23. optimum/rbln/transformers/generation/utils.py +399 -0
  24. optimum/rbln/transformers/models/__init__.py +1 -0
  25. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +24 -333
  26. optimum/rbln/transformers/models/llama/llama_architecture.py +49 -17
  27. optimum/rbln/transformers/models/llama/llama_architecture_cb.py +759 -0
  28. optimum/rbln/transformers/models/llama/modeling_llama.py +187 -75
  29. optimum/rbln/transformers/models/midm/__init__.py +32 -0
  30. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +22 -0
  31. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +303 -0
  32. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +1473 -0
  33. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +98 -0
  34. optimum/rbln/transformers/models/midm/midm_architecture.py +506 -0
  35. optimum/rbln/transformers/models/midm/modeling_midm.py +426 -0
  36. optimum/rbln/transformers/models/whisper/modeling_whisper.py +13 -3
  37. {optimum_rbln-0.1.0.dist-info → optimum_rbln-0.1.4.dist-info}/METADATA +5 -4
  38. optimum_rbln-0.1.4.dist-info/RECORD +63 -0
  39. {optimum_rbln-0.1.0.dist-info → optimum_rbln-0.1.4.dist-info}/WHEEL +1 -1
  40. optimum_rbln-0.1.0.dist-info/RECORD +0 -51
  41. {optimum_rbln-0.1.0.dist-info → optimum_rbln-0.1.4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/gpt2/modeling_gpt2.py
@@ -23,7 +23,6 @@
 
  import inspect
  import logging
- import warnings
  from pathlib import Path
  from tempfile import TemporaryDirectory
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
@@ -32,19 +31,13 @@ import rebel
  import torch
  from optimum.exporters import TasksManager
  from transformers import AutoModelForCausalLM, GPT2LMHeadModel, PretrainedConfig
- from transformers.generation.logits_process import LogitsProcessorList
- from transformers.generation.stopping_criteria import (
- StoppingCriteriaList,
- validate_stopping_criteria,
- )
- from transformers.generation.streamers import BaseStreamer
- from transformers.generation.utils import SampleDecoderOnlyOutput
  from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions, Seq2SeqLMOutput
 
  from ....modeling_base import RBLNBaseModel
  from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
  from ....utils.runtime_utils import RBLNPytorchRuntime
  from ....utils.save_utils import maybe_save_preprocessors
+ from ...generation.utils import RBLNGenerationMixin
  from .gpt2_architecture import GPT2LMHeadModelWrapper
 
 
@@ -66,7 +59,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
  return Seq2SeqLMOutput(logits=logits)
 
 
- class RBLNGPT2LMHeadModel(RBLNBaseModel):
+ class RBLNGPT2LMHeadModel(RBLNBaseModel, RBLNGenerationMixin):
  """
  The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
  embeddings).
@@ -135,6 +128,7 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
  subfolder: str = "",
  local_files_only: bool = False,
  trust_remote_code: bool = False,
+ model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
  **kwargs,
  ) -> "RBLNGPT2LMHeadModel":
  """
@@ -144,8 +138,16 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
  if task is None:
  task = TasksManager.infer_task_from_model(cls.auto_model_class)
 
- save_dir = TemporaryDirectory()
- save_dir_path = Path(save_dir.name)
+ if model_save_dir is None:
+ save_dir = TemporaryDirectory()
+ save_dir_path = Path(save_dir.name)
+ else:
+ save_dir = model_save_dir
+ if isinstance(save_dir, TemporaryDirectory):
+ save_dir_path = Path(model_save_dir.name)
+ else:
+ save_dir_path = Path(model_save_dir)
+ save_dir_path.mkdir(exist_ok=True)
 
  kwargs.update(
  {
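For reference, a minimal standalone sketch of the export-directory selection introduced in this hunk; the `resolve_save_dir` helper name is illustrative and not part of the package:

```python
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Tuple, Union


def resolve_save_dir(
    model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
) -> Tuple[Union[TemporaryDirectory, str, Path], Path]:
    """Mirror of the directory handling above: reuse a caller-provided location
    when given, otherwise fall back to a fresh TemporaryDirectory."""
    if model_save_dir is None:
        save_dir = TemporaryDirectory()      # scratch dir, removed when the object is cleaned up
        save_dir_path = Path(save_dir.name)
    else:
        save_dir = model_save_dir            # keep a handle so a TemporaryDirectory is not collected early
        if isinstance(save_dir, TemporaryDirectory):
            save_dir_path = Path(save_dir.name)
        else:
            save_dir_path = Path(save_dir)
        save_dir_path.mkdir(exist_ok=True)   # a caller-supplied path may not exist yet
    return save_dir, save_dir_path
```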
@@ -264,8 +266,7 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
  meta["rbln_max_seq_len"] = rbln_max_seq_len
  meta["rbln_pad_token_id"] = rbln_pad_token_id
 
- if rbln_batch_size is None:
- rbln_batch_size = 1
+ rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
 
  def get_input_info(query_length):
  return [
@@ -320,6 +321,7 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
  self.prompt_ids = input_ids
  self.rightpad_max_len = cur_len
  prompt_min_len = torch.min(torch.sum(attention_mask, dim=-1))
+ self.dummy_len = torch.sum(attention_mask, dim=-1) - prompt_min_len
 
  if cur_len % self.prefill_chunk_size == 0:
  pad_len = 0
@@ -329,12 +331,12 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
  attention_mask = self.prefill_attention_mask.clone()
  cache_position = torch.tensor(past_cached_length, dtype=torch.int32)
 
- query_length = prompt_min_len
+ query_length = prompt_min_len.item()
  else:
  cache_position = torch.tensor(past_cached_length, dtype=torch.int32)
  attention_mask = torch.zeros(batch_size, 1, 1, self.max_seq_len, dtype=torch.int64)
  attention_mask[:, :, :, : cache_position + 1] = 1
- input_ids = input_ids[:, -1:].contiguous()
+ input_ids = input_ids[:, cache_position : cache_position + 1].contiguous()
  query_length = 1
 
  model_inputs = {
@@ -357,25 +359,23 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
  query_length: Optional[torch.Tensor] = None,
  **kwargs,
  ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
-
  if past_key_values is not None:
  past_key_values += query_length
 
  if cache_position == 0:
- for _ in range(0, query_length, self.prefill_chunk_size):
- sliced_input_ids = input_ids[:, cache_position : cache_position + self.prefill_chunk_size]
- attention_mask[:, :, :, :cache_position] = 1
- attention_mask[:, :, :, cache_position : cache_position + self.prefill_chunk_size] = self.causal_mask
+ for step in range(0, query_length, self.prefill_chunk_size):
+ sliced_input_ids = input_ids[:, step : step + self.prefill_chunk_size]
+ attention_mask[:, :, :, :step] = 1
+ attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
 
  output = self.prefill_decoder(
  input_ids=sliced_input_ids.contiguous(),
  attention_mask=attention_mask.contiguous(),
- cache_position=cache_position,
+ cache_position=cache_position + step,
  )
- query_length -= self.prefill_chunk_size
- cache_position += self.prefill_chunk_size
 
- output = output.logits[:, query_length - 1].unsqueeze(1)
+ idx = query_length % self.prefill_chunk_size - 1
+ output = output.logits[:, idx].unsqueeze(1)
 
  else:
  output = self.decoder(
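The rewritten prefill loop above walks the prompt in fixed-size chunks and then picks the logit of the last real prompt token inside the final chunk. A standalone illustration of that indexing (chunk size and prompt lengths are arbitrary example values, not package defaults):

```python
# Illustration only: shows why `query_length % prefill_chunk_size - 1` selects the
# last valid prompt position in the final prefill chunk.
prefill_chunk_size = 128

for query_length in (1, 100, 128, 200, 256):
    # The prompt is fed to the prefill decoder in windows starting at
    # step = 0, prefill_chunk_size, 2 * prefill_chunk_size, ...
    last_step = ((query_length - 1) // prefill_chunk_size) * prefill_chunk_size
    # Index of the last real prompt token inside the final window. When the prompt
    # exactly fills the window this evaluates to -1, which still selects the last
    # position thanks to negative indexing on the logits tensor.
    idx = query_length % prefill_chunk_size - 1
    assert idx % prefill_chunk_size == (query_length - 1) - last_step
    print(f"query_length={query_length}: last chunk starts at {last_step}, logits index {idx}")
```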
@@ -389,312 +389,3 @@ class RBLNGPT2LMHeadModel(RBLNBaseModel):
 
  def __repr__(self):
  return repr(self.runtimes[0]) + "\n" + repr(self.runtimes[1])
-
- # call 'greedy_search` directly is deprecated and removed in v4.41.
- def greedy_search(self, *args, **kwargs):
- return self._greedy_search(*args, **kwargs)
-
- def _greedy_search(
- self,
- input_ids: torch.LongTensor,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- max_length: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- eos_token_id: Optional[Union[int, List[int]]] = None,
- output_logits: Optional[bool] = None,
- return_dict_in_generate: Optional[bool] = None,
- streamer: Optional["BaseStreamer"] = None,
- **model_kwargs,
- ) -> Union[SampleDecoderOnlyOutput, torch.LongTensor]:
-
- # init values
- logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
- stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-
- if max_length is not None:
- warnings.warn(
- "`max_length` is deprecated in this function, use"
- " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
- UserWarning,
- )
- stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
-
- pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
- eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
- if isinstance(eos_token_id, int):
- eos_token_id = [eos_token_id]
- eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
-
- return_dict_in_generate = (
- return_dict_in_generate
- if return_dict_in_generate is not None
- else self.generation_config.return_dict_in_generate
- )
-
- # init attention / hidden states / scores tuples
- raw_logits = () if (return_dict_in_generate and output_logits) else None
-
- # keep track of which sequences are already finished
- unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
-
- this_peer_finished = False # used by synced_gpus only
-
- while True:
- # prepare model inputs
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-
- # forward pass to get next token
- outputs = self(
- **model_inputs,
- return_dict=True,
- )
- next_token_logits = outputs.logits[:, -1, :]
-
- # pre-process distribution
- next_tokens_scores = logits_processor(input_ids, next_token_logits)
-
- # Store scores, attentions and hidden_states when required
- if return_dict_in_generate:
- if output_logits:
- raw_logits += (next_token_logits,)
-
- # argmax
- next_tokens = torch.argmax(next_tokens_scores, dim=-1)
-
- # finished sentences should have their next token be a padding token
- if eos_token_id is not None:
- if pad_token_id is None:
- raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
- next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-
- ########################################################################################################
- # thkim change for right-padding batch
- # if min_input_len <= update_idx < max_input_len
- # update validate input_ids[:,update_idx]
- # TODO : raw_logits contains dummy next_token's logits
- update_idx = model_inputs["cache_position"] + model_inputs["query_length"]
- if update_idx < self.rightpad_max_len:
- # update exist input_ids rather than concat
- valid_indices = model_kwargs["attention_mask"][:, update_idx] == 0
- input_ids[valid_indices, update_idx] = next_tokens[valid_indices]
- model_kwargs["attention_mask"][valid_indices, update_idx] = 1
-
- # dummy next_token -> pad_token_id for streamer
- # in order to skip by 'skip_special_tokens = True"
- dummy_indices = ~valid_indices
- next_tokens[dummy_indices] = pad_token_id
- else:
- ############################################END#########################################################
- # update generated ids, model inputs, and length for next step
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-
- model_kwargs = self._update_model_kwargs_for_generation(
- outputs,
- model_kwargs,
- is_encoder_decoder=self.config.is_encoder_decoder,
- )
-
- if streamer is not None:
- streamer.put(next_tokens.cpu())
-
- # if eos_token was found in one sentence, set sentence to finished
- if eos_token_id_tensor is not None:
- ####################################################################
- # thkim : to do not finish sequence of dummy_decoder of right_padding
- if hasattr(self, "rightpad_max_len"):
- update_idx = model_inputs["cache_position"] + model_inputs["query_length"]
- if update_idx < self.rightpad_max_len:
- next_tokens += model_kwargs["attention_mask"][:, update_idx] * eos_token_id_tensor
- ######################################################################
- unfinished_sequences = unfinished_sequences.mul(
- next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
- )
-
- # stop when each sentence is finished
- if unfinished_sequences.max() == 0:
- this_peer_finished = True
-
- # stop if we exceed the maximum length
- # thkim : backward compatibility bool vs torch.BoolTensor
- is_stop = stopping_criteria(input_ids, None)
- if isinstance(is_stop, torch.BoolTensor):
- is_stop = torch.all(is_stop)
- if is_stop:
- this_peer_finished = True
-
- if this_peer_finished:
- break
-
- if streamer is not None:
- streamer.end()
-
- if return_dict_in_generate:
- return SampleDecoderOnlyOutput(
- sequences=input_ids,
- logits=raw_logits,
- )
- else:
- return input_ids
-
- # call 'sample` directly is deprecated and removed in v4.41.
- def sample(self, *args, **kwargs):
- return self._sample(*args, **kwargs)
-
- def _sample(
- self,
- input_ids: torch.LongTensor,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- logits_warper: Optional[LogitsProcessorList] = None,
- max_length: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- eos_token_id: Optional[Union[int, List[int]]] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- output_scores: Optional[bool] = None,
- output_logits: Optional[bool] = None,
- return_dict_in_generate: Optional[bool] = None,
- synced_gpus: bool = False,
- streamer: Optional["BaseStreamer"] = None,
- **model_kwargs,
- ) -> Union[SampleDecoderOnlyOutput, torch.LongTensor]:
- # init values
- logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
- stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-
- if max_length is not None:
- warnings.warn(
- "`max_length` is deprecated in this function, use"
- " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
- UserWarning,
- )
- stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
-
- logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
- pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
- eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
-
- if isinstance(eos_token_id, int):
- eos_token_id = [eos_token_id]
- eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
-
- output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
- output_logits = output_logits if output_logits is not None else False
-
- # init attention / hidden states / scores tuples
- scores = () if (return_dict_in_generate and output_scores) else None
- raw_logits = () if (return_dict_in_generate and output_logits) else None
-
- # keep track of which sequences are already finished
- batch_size, cur_len = input_ids.shape
- unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
- this_peer_finished = False
-
- # model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
-
- while True:
- # prepare model inputs
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-
- # forward pass to get next token
- outputs = self(
- **model_inputs,
- return_dict=True,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- )
-
- next_token_logits = outputs.logits[:, -1, :]
-
- # pre-process distribution
- next_token_scores = logits_processor(input_ids, next_token_logits)
- next_token_scores = logits_warper(input_ids, next_token_scores)
-
- # Store scores, attentions and hidden_states when required
- if return_dict_in_generate:
- if output_scores:
- scores += (next_token_scores,)
- if output_logits:
- raw_logits += (next_token_logits,)
-
- # sample
- probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
- next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-
- # finished sentences should have their next token be a padding token
- if eos_token_id is not None:
- if pad_token_id is None:
- raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
- next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-
- ########################################################################################################
- # thkim change for right-padding batch
- # if min_input_len <= update_idx < max_input_len
- # update validate input_ids[:,update_idx]
- # TODO : raw_logits contains dummy next_token's logits
- update_idx = model_inputs["cache_position"] + model_inputs["query_length"]
- if update_idx < self.rightpad_max_len:
- # update exist input_ids rather than concat
- valid_indices = model_kwargs["attention_mask"][:, update_idx] == 0
- input_ids[valid_indices, update_idx] = next_tokens[valid_indices]
- model_kwargs["attention_mask"][valid_indices, update_idx] = 1
-
- # dummy next_token -> pad_token_id for streamer
- # in order to skip by 'skip_special_tokens = True"
- dummy_indices = ~valid_indices
- next_tokens[dummy_indices] = pad_token_id
- else:
- ############################################END#########################################################
-
- # update generated ids, model inputs, and length for next step
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-
- model_kwargs = self._update_model_kwargs_for_generation(
- outputs,
- model_kwargs,
- is_encoder_decoder=self.config.is_encoder_decoder,
- )
-
- if streamer is not None:
- streamer.put(next_tokens.cpu())
-
- # if eos_token was found in one sentence, set sentence to finished
- if eos_token_id_tensor is not None:
- ####################################################################
- # thkim : to do not finish sequence of dummy_decoder of right_padding
- if hasattr(self, "rightpad_max_len"):
- update_idx = model_inputs["cache_position"] + model_inputs["query_length"]
- if update_idx < self.rightpad_max_len:
- next_tokens += model_kwargs["attention_mask"][:, update_idx] * eos_token_id_tensor
- ######################################################################
- unfinished_sequences = unfinished_sequences.mul(
- next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
- )
-
- # stop when each sentence is finished
- if unfinished_sequences.max() == 0:
- this_peer_finished = True
-
- # stop if we exceed the maximum length
- # thkim : backward compatibility bool vs list[bool]
- is_stop = stopping_criteria(input_ids, None)
- if isinstance(is_stop, torch.BoolTensor):
- is_stop = torch.all(is_stop)
- if is_stop:
- this_peer_finished = True
-
- if this_peer_finished:
- break
-
- if streamer is not None:
- streamer.end()
-
- if return_dict_in_generate:
- return SampleDecoderOnlyOutput(
- sequences=input_ids,
- scores=scores,
- logits=raw_logits,
- )
- else:
- return input_ids
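With this hunk the hand-rolled `_greedy_search`/`_sample` loops are removed from the GPT-2 model file; the class now inherits its generation loop from `RBLNGenerationMixin` in the new `optimum/rbln/transformers/generation/utils.py` (+399 lines, listed above). The sketch below shows only the general mixin pattern with hypothetical names; it is not the contents of that module:

```python
# Hypothetical illustration of the mixin composition pattern: a shared decode loop
# factored out so several model classes can inherit it. Names are made up.
import torch


class GreedyLoopMixin:
    """Shared token-by-token decode loop, usable by any model that implements
    prepare_inputs_for_generation() and a logits-returning forward()."""

    def greedy_generate(self, input_ids: torch.Tensor, max_new_tokens: int, eos_token_id: int) -> torch.Tensor:
        for _ in range(max_new_tokens):
            model_inputs = self.prepare_inputs_for_generation(input_ids)
            logits = self(**model_inputs).logits
            next_tokens = logits[:, -1, :].argmax(dim=-1, keepdim=True)   # greedy pick
            input_ids = torch.cat([input_ids, next_tokens], dim=-1)
            if (next_tokens == eos_token_id).all():                       # every sequence finished
                break
        return input_ids
```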
optimum/rbln/transformers/models/llama/llama_architecture.py
@@ -36,7 +36,6 @@ from transformers.models.llama.modeling_llama import (
  LlamaForCausalLM,
  LlamaModel,
  LlamaRotaryEmbedding,
- repeat_kv,
  )
 
 
@@ -149,26 +148,41 @@ class _LlamaAttention(LlamaAttention):
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
+ # change to remove repeat
+ key_states = key_states.unsqueeze(2)
+ value_states = value_states.unsqueeze(2)
+ query_states = query_states.view(
+ bsz, self.num_key_value_heads, self.num_heads // self.num_key_value_heads, q_len, self.head_dim
+ )
+
  if past_key_value is not None:
  cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
- key_states = repeat_kv(key_states, self.num_key_value_groups)
- value_states = repeat_kv(value_states, self.num_key_value_groups)
+ # change to remove repeat
+ # key_states = repeat_kv(key_states, self.num_key_value_groups)
+ # value_states = repeat_kv(value_states, self.num_key_value_groups)
 
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+ # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
- raise ValueError(
- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
- f" {attn_weights.size()}"
- )
+ attn_weights = torch.matmul(query_states, key_states.transpose(3, 4)) / math.sqrt(self.head_dim)
+
+ # change to remove repeat
+ # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ # raise ValueError(
+ # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+ # f" {attn_weights.size()}"
+ # )
 
  if attention_mask is not None:
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
  raise ValueError(
  f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
  )
+ else:
+ # change to remove repeat
+ attention_mask = attention_mask.unsqueeze(2)
+
  attn_weights = attn_weights + attention_mask
 
  # upcast attention to fp32
@@ -176,6 +190,9 @@ class _LlamaAttention(LlamaAttention):
  attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
  attn_output = torch.matmul(attn_weights, value_states)
 
+ # change to remove repeat
+ attn_output = attn_output.view(bsz, self.num_heads, q_len, self.head_dim)
+
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
  raise ValueError(
  f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
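The attention hunks above drop `repeat_kv` and instead broadcast the query over a per-KV-head group axis. A self-contained check that the broadcasted matmul matches the repeat_kv-style formulation (tensor shapes are arbitrary example values, not the compiled model's configuration):

```python
import torch

bsz, num_heads, num_kv_heads, q_len, head_dim = 2, 8, 2, 4, 16
n_rep = num_heads // num_kv_heads

q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_kv_heads, q_len, head_dim)

# Reference path: repeat each KV head n_rep times, then a plain 4-D matmul.
k_rep = k.repeat_interleave(n_rep, dim=1)               # [bsz, num_heads, q_len, head_dim]
ref = torch.matmul(q, k_rep.transpose(2, 3))            # [bsz, num_heads, q_len, q_len]

# Broadcast path (what the patched attention does): add a group axis instead of copying KV.
q5 = q.view(bsz, num_kv_heads, n_rep, q_len, head_dim)  # group query heads per KV head
k5 = k.unsqueeze(2)                                     # [bsz, num_kv_heads, 1, q_len, head_dim]
out = torch.matmul(q5, k5.transpose(3, 4))              # broadcasts over the n_rep axis
out = out.reshape(bsz, num_heads, q_len, q_len)

torch.testing.assert_close(ref, out)                    # same scores, no repeated KV tensors
```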
@@ -516,17 +533,32 @@ class RebelDynamicCache(DynamicCache):
  if len(self.key_cache) <= layer_idx:
  self.key_cache.append(key_states)
  self.value_cache.append(value_states)
+ return self.key_cache[layer_idx], self.value_cache[layer_idx]
  else:
- self.key_cache[layer_idx] = self.key_cache[layer_idx].slice_scatter(
- key_states, dim=2, start=self.current_step, end=self.current_step + key_states.shape[2]
+ # change to remove repeat
+ # self.key_cache[layer_idx] = self.key_cache[layer_idx].slice_scatter(
+ # key_states, dim=2, start=self.current_step, end=self.current_step + key_states.shape[2]
+ # )
+ # self.value_cache[layer_idx] = self.value_cache[layer_idx].slice_scatter(
+ # value_states, dim=2, start=self.current_step, end=self.current_step + value_states.shape[2]
+ # )
+ updated_key = (
+ self.key_cache[layer_idx]
+ .unsqueeze(2)
+ .slice_scatter(
+ key_states, dim=-2, start=self.current_step, end=self.current_step + key_states.shape[-2]
+ )
  )
- self.value_cache[layer_idx] = self.value_cache[layer_idx].slice_scatter(
- value_states, dim=2, start=self.current_step, end=self.current_step + value_states.shape[2]
+ updated_value = (
+ self.value_cache[layer_idx]
+ .unsqueeze(2)
+ .slice_scatter(
+ value_states, dim=-2, start=self.current_step, end=self.current_step + value_states.shape[-2]
+ )
  )
- # self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
- # self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
-
- return self.key_cache[layer_idx], self.value_cache[layer_idx]
+ self.key_cache[layer_idx] = updated_key.squeeze(2)
+ self.value_cache[layer_idx] = updated_value.squeeze(2)
+ return updated_key, updated_value
 
  def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
  """Returns the sequence length of the cached states. A layer index can be optionally passed."""