optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.4a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. optimum/rbln/__init__.py +12 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +16 -6
  4. optimum/rbln/diffusers/__init__.py +12 -0
  5. optimum/rbln/diffusers/configurations/__init__.py +3 -0
  6. optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
  7. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
  9. optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
  11. optimum/rbln/diffusers/modeling_diffusers.py +1 -1
  12. optimum/rbln/diffusers/models/__init__.py +17 -3
  13. optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
  14. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  15. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
  16. optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
  17. optimum/rbln/diffusers/models/controlnet.py +17 -2
  18. optimum/rbln/diffusers/models/transformers/prior_transformer.py +16 -2
  19. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +16 -1
  20. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +14 -1
  21. optimum/rbln/diffusers/models/unets/__init__.py +1 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +18 -2
  23. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
  24. optimum/rbln/diffusers/pipelines/__init__.py +4 -0
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +2 -2
  26. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +13 -4
  28. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +13 -4
  29. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +13 -4
  30. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +13 -4
  31. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +1 -1
  32. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  33. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -2
  34. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
  35. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
  36. optimum/rbln/modeling.py +20 -45
  37. optimum/rbln/modeling_base.py +12 -8
  38. optimum/rbln/transformers/configuration_generic.py +0 -27
  39. optimum/rbln/transformers/modeling_attention_utils.py +242 -109
  40. optimum/rbln/transformers/modeling_generic.py +2 -61
  41. optimum/rbln/transformers/modeling_outputs.py +1 -0
  42. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
  43. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
  44. optimum/rbln/transformers/models/auto/auto_factory.py +1 -0
  45. optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
  46. optimum/rbln/transformers/models/bert/modeling_bert.py +86 -1
  47. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +42 -15
  48. optimum/rbln/transformers/models/clip/modeling_clip.py +40 -2
  49. optimum/rbln/transformers/models/colpali/colpali_architecture.py +2 -2
  50. optimum/rbln/transformers/models/colpali/modeling_colpali.py +6 -45
  51. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +0 -2
  52. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +10 -1
  53. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  54. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +92 -43
  55. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +207 -64
  56. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +17 -9
  57. optimum/rbln/transformers/models/decoderonly/lora_architecture.py +1 -1
  58. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +140 -46
  59. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +17 -0
  60. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
  61. optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
  62. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +7 -1
  63. optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +42 -70
  64. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +46 -31
  65. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +1 -1
  66. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +24 -9
  67. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -5
  68. optimum/rbln/transformers/models/llava/modeling_llava.py +37 -25
  69. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +3 -5
  70. optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -22
  71. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  72. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  73. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +13 -1
  74. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +2 -2
  75. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -28
  76. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +8 -9
  77. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -7
  78. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +1 -1
  79. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -20
  80. optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
  81. optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
  82. optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
  83. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -4
  84. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +36 -12
  85. optimum/rbln/transformers/models/siglip/modeling_siglip.py +17 -1
  86. optimum/rbln/transformers/models/swin/modeling_swin.py +17 -4
  87. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  88. optimum/rbln/transformers/models/t5/t5_architecture.py +1 -1
  89. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +25 -10
  90. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  91. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
  92. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +60 -8
  93. optimum/rbln/transformers/models/whisper/generation_whisper.py +48 -14
  94. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  95. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +53 -0
  96. optimum/rbln/transformers/utils/rbln_quantization.py +9 -0
  97. optimum/rbln/utils/deprecation.py +213 -0
  98. optimum/rbln/utils/hub.py +14 -3
  99. optimum/rbln/utils/import_utils.py +7 -1
  100. optimum/rbln/utils/runtime_utils.py +32 -0
  101. optimum/rbln/utils/submodule.py +3 -1
  102. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/METADATA +2 -2
  103. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/RECORD +106 -99
  104. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/WHEEL +1 -1
  105. optimum/rbln/utils/depreacate_utils.py +0 -16
  106. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/entry_points.txt +0 -0
  107. {optimum_rbln-0.9.3rc0.dist-info → optimum_rbln-0.9.4a2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py
@@ -13,7 +13,7 @@
  # limitations under the License.

  from collections import deque
- from typing import Any, Optional
+ from typing import TYPE_CHECKING, Any, Optional

  import rebel
  import torch
@@ -24,6 +24,10 @@ from ...modeling_outputs import RBLNDecoderOnlyOutput
  from .configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig


+ if TYPE_CHECKING:
+     from transformers.configuration_utils import PreTrainedConfig
+
+
  class RBLNPageTableManager:
      EMPTY_BLOCK = -1
      NO_BLOCKS_ERROR = (
@@ -46,6 +50,12 @@ class RBLNPageTableManager:
          """
          If the block is empty (empty_block), allocates a block from the free_block_pool.
          """
+         if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+             raise IndexError(
+                 f"Invalid index(batch_idx={batch_idx}, block_idx={block_idx}): \n \
+                 BlockTable Shape(batch_axis, block_axis): {self.block_tables.shape}, BlockSize: {self.rbln_config.kvcache_block_size}"
+             )
+
          if self.block_tables[batch_idx][block_idx] == self.EMPTY_BLOCK:
              if self.free_block_pool:
                  block = self.free_block_pool.popleft()
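The hunk above moves the index bounds check into update_block itself (the next hunk removes it from the caller), so an out-of-range batch or block index now fails before any allocation happens. Below is a minimal, standalone sketch of the same deque-backed block-allocation pattern; it is illustrative only and not the library's actual RBLNPageTableManager.

from collections import deque

import torch


class TinyPageTable:
    EMPTY_BLOCK = -1

    def __init__(self, batch_size: int, blocks_per_seq: int, total_blocks: int):
        self.block_tables = torch.full((batch_size, blocks_per_seq), self.EMPTY_BLOCK, dtype=torch.int16)
        self.free_block_pool = deque(range(total_blocks))

    def update_block(self, batch_idx: int, block_idx: int) -> None:
        # Validate indices up front so a bad caller fails loudly instead of corrupting the table.
        if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
            raise IndexError(
                f"Invalid index (batch_idx={batch_idx}, block_idx={block_idx}); "
                f"block table shape is {tuple(self.block_tables.shape)}"
            )
        # Only allocate when the slot is still empty; repeated calls are a no-op.
        if self.block_tables[batch_idx][block_idx] == self.EMPTY_BLOCK:
            if not self.free_block_pool:
                raise RuntimeError("No free KV-cache blocks left.")
            self.block_tables[batch_idx][block_idx] = self.free_block_pool.popleft()


table = TinyPageTable(batch_size=2, blocks_per_seq=4, total_blocks=8)
table.update_block(0, 0)     # allocates the first free block for sequence 0
# table.update_block(0, 99)  # would raise IndexError, mirroring the check added above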
@@ -96,8 +106,6 @@ class RBLNPageTableManager:
          s, e = cache_position[0][0].item(), cache_position[0][-1].item()
          for position in range(s, e + 1, self.rbln_config.kvcache_block_size):
              block_idx = position // self.rbln_config.kvcache_block_size
-             if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
-                 raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
              self.update_block(batch_idx, block_idx)

          return self.replace_empty_block(self.block_tables[batch_idx])
@@ -169,20 +177,23 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          dec_attn_mask: torch.Tensor,
          page_table_manager: RBLNPageTableManager,
          rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
-         out_buffers: Optional[torch.Tensor] = None,
+         config: "PreTrainedConfig" = None,
+         logits_last_dim: Optional[int] = None,
          **kwargs: Any,
      ) -> None:
          super().__init__(runtime, **kwargs)
          self.phase = phase
          self.batch_size = batch_size
          self.rbln_config = rbln_config
+         self.config = config
+         self.logits_last_dim = logits_last_dim

          # shared resources between prefill and decode phase
          self.dec_attn_mask = dec_attn_mask
          self.page_table_manager = page_table_manager
+         self.out_buffers = None

          if self.phase == "prefill":
-             self.out_buffers = out_buffers
              self.causal_mask = 1 - torch.triu(
                  torch.ones(1, 1, self.rbln_config.prefill_chunk_size, self.rbln_config.prefill_chunk_size), diagonal=1
              )
@@ -276,28 +287,48 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          if lora_int_ids is not None and lora_int_ids.shape[0] != self.batch_size:
              raise ValueError(f"lora_int_ids size mismatch: got {lora_int_ids.shape[0]}, expected {self.batch_size}.")

-         if self.batch_size != cache_position.shape[0]:
+         batch_size = inputs.shape[0]
+         if batch_size != self.batch_size:
              raise RuntimeError(
-                 f"Cache position size mismatch: got {cache_position.shape[0]}, expected {self.batch_size}."
+                 f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
+             )
+
+         if batch_size != cache_position.shape[0]:
+             raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
+
+         if self.rbln_config.use_local_attention:
+             local_block_tables = (
+                 local_block_tables
+                 if local_block_tables is not None
+                 else torch.arange(0, batch_size, dtype=torch.int16).view(batch_size, -1)
              )

          if self.rbln_config.use_attention_mask and attention_mask is None:
-             for b_idx in range(self.batch_size):
+             for b_idx in range(batch_size):
                  decoding_step = cache_position[b_idx].item()
                  if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
                      raise ValueError(
                          f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
                      )

-                 if is_external_block_tables:
-                     self.dec_attn_mask[b_idx].fill_(0)
-                     self.dec_attn_mask[b_idx, :, :, : decoding_step + 1] = 1
+                 if self.rbln_config.use_position_ids:
+                     self.dec_attn_mask[b_idx, decoding_step] = 1
+
+                     if self.batch_size < block_tables.shape[0]:
+                         block_tables = block_tables[: self.batch_size]
+
+                     if self.dec_attn_mask is not None and self.batch_size < self.dec_attn_mask.shape[0]:
+                         self.dec_attn_mask = self.dec_attn_mask[: self.batch_size]
                  else:
-                     self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
+                     if is_external_block_tables:
+                         self.dec_attn_mask[b_idx].fill_(0)
+                         self.dec_attn_mask[b_idx, :, :, : decoding_step + 1] = 1
+                     else:
+                         self.dec_attn_mask[b_idx, :, :, decoding_step] = 1

              attention_mask = self.dec_attn_mask

-         logits = super().forward(
+         outputs = super().forward(
              inputs,
              cache_position,
              block_tables,
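The decode path above now maintains two attention-mask layouts: a 2D [batch, seq_len] mask when position_ids are in use, and the original broadcastable 4D [batch, 1, 1, seq_len] mask otherwise. The toy sketch below only illustrates the two layouts and the per-step update; shapes and the update rule are simplified and this is not the runtime code.

import torch

batch_size, max_seq_len = 2, 16
mask_2d = torch.zeros(batch_size, max_seq_len)        # layout used together with position_ids
mask_4d = torch.zeros(batch_size, 1, 1, max_seq_len)  # original broadcastable layout

cache_position = torch.tensor([[5], [9]])             # current decoding step per sequence
for b_idx in range(batch_size):
    step = cache_position[b_idx].item()
    mask_2d[b_idx, step] = 1                           # mark only the newly decoded position
    mask_4d[b_idx, :, :, : step + 1] = 1               # everything up to the current step is visible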
@@ -306,15 +337,20 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              attention_mask if self.rbln_config.use_attention_mask else None,
              position_ids if self.rbln_config.use_position_ids else None,
              lora_int_ids if self.rbln_config.use_lora else None,
+             out=self.out_buffers,
          )

-         return RBLNDecoderOnlyOutput(logits=logits)
+         if self.rbln_config.output_hidden_states:
+             return RBLNDecoderOnlyOutput(logits=outputs[0], hidden_states=tuple(outputs[1:]))
+         else:
+             return RBLNDecoderOnlyOutput(logits=outputs, hidden_states=None)

      def _prepare_prefill_inputs(
          self,
          inputs: torch.Tensor,
          cache_position: Optional[torch.Tensor] = None,
          attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
          position_embed: Optional[torch.Tensor] = None,
          token_type_ids: Optional[torch.Tensor] = None,
      ):
@@ -324,9 +360,27 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          # Handle continuous batching in a compiled graph by extracting valid inputs
          # If an attention mask is provided, select only the valid (non-masked) inputs
          if attention_mask is not None:
-             inputs = inputs[:, attention_mask.bool()]
-             position_embed = None if position_embed is None else position_embed[:, :, :, attention_mask.bool(), :]
-             token_type_ids = None if token_type_ids is None else token_type_ids[:, attention_mask.bool()]
+             if attention_mask.dim() != 1:
+                 raise ValueError("attention_mask must be a 1D tensor.")
+
+             mask_bool = attention_mask.to(dtype=torch.bool)
+             if (~mask_bool).any():
+                 indice_one = torch.nonzero(mask_bool, as_tuple=False)
+                 if indice_one.numel() == 0:
+                     raise ValueError("attention_mask with padding must include at least one real token.")
+                 first_one_idx, last_one_idx = int(indice_one[0].item()), int(indice_one[-1].item())
+                 if last_one_idx - first_one_idx + 1 != mask_bool.sum():
+                     raise ValueError(
+                         "attention_mask must group all 1s together (e.g. 000111 or 1111000). "
+                         "Zeros between real tokens like 101010 are not supported."
+                     )
+
+                 if self.rbln_config.can_generate and not mask_bool[first_one_idx:].all():
+                     raise ValueError("attention_mask must be left padded for generation.")
+
+             inputs = inputs[:, mask_bool]
+             position_embed = None if position_embed is None else position_embed[:, :, :, mask_bool, :]
+             token_type_ids = None if token_type_ids is None else token_type_ids[:, mask_bool]

          query_length = inputs.shape[1]
          if query_length > self.rbln_config.max_seq_len:
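The validation added above accepts only masks whose real tokens form a single contiguous run and, when the model can generate, requires that run to be left padded. A standalone sketch of that check is shown below; it is written as a free function for illustration and is not the library's method.

import torch


def validate_prefill_mask(attention_mask: torch.Tensor, can_generate: bool = True) -> None:
    if attention_mask.dim() != 1:
        raise ValueError("attention_mask must be a 1D tensor.")
    mask_bool = attention_mask.to(dtype=torch.bool)
    ones = torch.nonzero(mask_bool, as_tuple=False)
    if ones.numel() == 0:
        raise ValueError("attention_mask must include at least one real token.")
    first, last = int(ones[0]), int(ones[-1])
    # All 1s must be contiguous: the span from first to last 1 must contain only 1s.
    if last - first + 1 != int(mask_bool.sum()):
        raise ValueError("attention_mask must keep all 1s contiguous (e.g. 000111), not interleaved (101010).")
    # For generation, the run of 1s must reach the end of the mask, i.e. only left padding.
    if can_generate and not mask_bool[first:].all():
        raise ValueError("attention_mask must be left padded for generation.")


validate_prefill_mask(torch.tensor([0, 0, 1, 1, 1]))    # ok: left padded
# validate_prefill_mask(torch.tensor([1, 0, 1, 1]))     # raises: 1s are not contiguous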
@@ -335,17 +389,21 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              )

          # Initialize attention mask for chunked processing
-         chunked_attention_mask = (
-             torch.zeros(
-                 1,
-                 1,
-                 self.rbln_config.prefill_chunk_size,
-                 self.rbln_config.max_seq_len,
-                 dtype=self.rbln_config.torch_dtype,
-             )
-             if self.rbln_config.use_attention_mask
-             else None
-         )
+         if self.rbln_config.use_attention_mask:
+             if self.rbln_config.use_position_ids:
+                 chunked_attention_mask = torch.zeros(
+                     1, self.rbln_config.max_seq_len, dtype=self.rbln_config.torch_dtype
+                 )
+             else:
+                 chunked_attention_mask = torch.zeros(
+                     1,
+                     1,
+                     self.rbln_config.prefill_chunk_size,
+                     self.rbln_config.max_seq_len,
+                     dtype=self.rbln_config.torch_dtype,
+                 )
+         else:
+             chunked_attention_mask = None

          cache_position = (
              torch.arange(query_length, dtype=torch.int32).unsqueeze(0) if cache_position is None else cache_position
@@ -363,7 +421,11 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              cache_position = F.pad(cache_position, (0, padding_size))

          # Overwrite position_ids and padded_cache_lengths
-         position_ids = cache_position.clone() if self.rbln_config.use_position_ids else None
+         if self.rbln_config.use_position_ids and position_ids is None:
+             position_ids = cache_position.clone()
+         else:
+             position_ids = position_ids
+

          padded_cache_lengths = 0
          return (
@@ -377,6 +439,68 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              token_type_ids,
          )

+     def _prepare_prefill_outputs(
+         self,
+         query_length: int,
+         attention_mask: Optional[torch.Tensor] = None,
+     ):
+         # Prepare out buffers
+         padding_size = (self.rbln_config.prefill_chunk_size - query_length) % self.rbln_config.prefill_chunk_size
+         padded_input_length = query_length + padding_size
+         padded_mask_length = (
+             attention_mask.shape[-1] + padding_size if attention_mask is not None else padded_input_length
+         )
+         out_buffers = [[] for _ in range(padded_input_length // self.rbln_config.prefill_chunk_size)]
+
+         valid_start_index = (
+             int(torch.nonzero(attention_mask, as_tuple=False)[0][0].item()) if attention_mask is not None else 0
+         )
+
+         if self.logits_last_dim is None:
+             logits_last_dim = self.config.vocab_size if self.rbln_config.can_generate else self.config.hidden_size
+         else:
+             logits_last_dim = self.logits_last_dim
+
+         # Prepare logits buffer
+         logits_size = (
+             1,
+             1 if self.rbln_config.logits_to_keep == 1 else padded_mask_length,
+             logits_last_dim,
+         )
+         output_logits = torch.full(logits_size, fill_value=1e-10, dtype=self.rbln_config.torch_dtype)
+
+         if self.rbln_config.logits_to_keep == 1:
+             for i in range(padded_input_length // self.rbln_config.prefill_chunk_size):
+                 out_buffers[i].append(output_logits)
+         else:
+             for i in range(padded_input_length // self.rbln_config.prefill_chunk_size):
+                 s_idx = i * self.rbln_config.prefill_chunk_size + valid_start_index
+                 out_buffers[i].append(output_logits[:, s_idx : s_idx + self.rbln_config.prefill_chunk_size])
+
+         # Prepare output hidden states
+         output_hidden_states = None
+         if self.rbln_config.output_hidden_states:
+             hidden_states_size = (
+                 1,
+                 padded_mask_length,
+                 self.config.hidden_size,
+             )
+             output_hidden_states = [
+                 torch.full(hidden_states_size, fill_value=1e-10, dtype=self.rbln_config.torch_dtype)
+                 for _ in range(self.config.num_hidden_layers + 1)
+             ]
+
+             for i in range(padded_input_length // self.rbln_config.prefill_chunk_size):
+                 s_idx = i * self.rbln_config.prefill_chunk_size + valid_start_index
+                 out_buffers[i].extend(
+                     [
+                         hidden_states_buffer[:, s_idx : s_idx + self.rbln_config.prefill_chunk_size]
+                         for hidden_states_buffer in output_hidden_states
+                     ]
+                 )
+
+         return out_buffers, output_logits, output_hidden_states
+
      def prefill_forward(
          self,
          inputs: torch.Tensor,
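The new _prepare_prefill_outputs above pre-allocates one full-length logits buffer (plus optional hidden-state buffers) and hands each prefill chunk a view into it, so the runtime writes results in place through out= and no torch.concat is needed afterwards. The toy sketch below shows the aliasing idea with a plain matmul standing in for the compiled runtime call; names and sizes are illustrative only.

import torch

chunk, seq_len, vocab = 4, 12, 10
hidden = torch.randn(1, seq_len, 8)
lm_head = torch.randn(8, vocab)

# One buffer for the whole (padded) sequence, sliced into per-chunk views.
output_logits = torch.full((1, seq_len, vocab), 1e-10)
out_buffers = [output_logits[:, s : s + chunk] for s in range(0, seq_len, chunk)]

for i, s in enumerate(range(0, seq_len, chunk)):
    # Stand-in for the compiled forward: it writes its result directly into `out=`.
    torch.matmul(hidden[:, s : s + chunk], lm_head, out=out_buffers[i])

# The slices alias output_logits, so the aggregate buffer is already fully populated.
assert torch.allclose(output_logits, hidden @ lm_head)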
@@ -385,6 +509,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          batch_idx: Optional[int] = None,
          block_tables: Optional[torch.Tensor] = None,
          is_external_block_tables: Optional[bool] = None,
+         position_ids: Optional[torch.Tensor] = None,
          position_embed: Optional[torch.Tensor] = None,
          token_type_ids: Optional[torch.Tensor] = None,
          local_block_tables: Optional[torch.Tensor] = None,
@@ -417,9 +542,11 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              query_length,
              token_type_ids,
          ) = self._prepare_prefill_inputs(
-             inputs, cache_position, attention_mask, position_embed, token_type_ids=token_type_ids
+             inputs, cache_position, attention_mask, position_ids, position_embed, token_type_ids=token_type_ids
          )

+         out_buffers, output_logits, output_hidden_states = self._prepare_prefill_outputs(query_length, attention_mask)
+
          # Assumed that prefix caching was performed externally if cache_position doesn't start from 0.
          prefix_cached_len = cache_position[0][0].item()
          if prefix_cached_len > 0:
@@ -428,11 +555,13 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                      "Prefix Caching is not supported yet for non-multiple of prefill_chunk_size."
                  )
              if self.rbln_config.use_attention_mask:
-                 chunked_attention_mask[:, :, :, :prefix_cached_len] = 1
+                 if self.rbln_config.use_position_ids:
+                     chunked_attention_mask[:, :prefix_cached_len] = 1
+                 else:
+                     chunked_attention_mask[:, :, :, :prefix_cached_len] = 1

          # Process input in chunks of size `prefill_chunk_size`
-         output_logits = []
-         for step in range(0, query_length, self.rbln_config.prefill_chunk_size):
+         for i, step in enumerate(range(0, query_length, self.rbln_config.prefill_chunk_size)):
              s, e = step, step + self.rbln_config.prefill_chunk_size
              # Extract the current chunk of inputs, cache positions, position ids, and position embeddings
              input_chunk = inputs[:, s:e]
@@ -441,17 +570,29 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              position_embed_chunk = position_embed[:, :, :, s:e, :] if position_embed is not None else None

              # Update attention mask to ensure proper causal behavior
-             if self.rbln_config.use_attention_mask and not self.rbln_config.use_position_ids:
-                 if step > 0: # update previous chunk
-                     chunked_attention_mask[
-                         :,
-                         :,
-                         :,
-                         s - self.rbln_config.prefill_chunk_size + prefix_cached_len : e
-                         - self.rbln_config.prefill_chunk_size
-                         + prefix_cached_len,
-                     ] = 1
-                 chunked_attention_mask[:, :, :, s + prefix_cached_len : e + prefix_cached_len] = self.causal_mask
+             if self.rbln_config.use_attention_mask:
+                 if self.rbln_config.use_position_ids:
+                     if step > 0: # update previous chunk
+                         # Update attention mask for the previous chunk (from s - prefill_chunk_size to s)
+                         prev_chunk_start = s - self.rbln_config.prefill_chunk_size + prefix_cached_len
+                         prev_chunk_end = s + prefix_cached_len
+                         chunked_attention_mask[:, prev_chunk_start:prev_chunk_end] = 1
+
+                     current_chunk_start = s + prefix_cached_len
+                     current_chunk_end = min(e, query_length) + prefix_cached_len
+                     if current_chunk_end > current_chunk_start:
+                         chunked_attention_mask[:, current_chunk_start:current_chunk_end] = 1
+
+                 else:
+                     if step > 0: # update previous chunk
+                         # Update attention mask for the previous chunk (from s - prefill_chunk_size to s)
+                         prev_chunk_start = s - self.rbln_config.prefill_chunk_size + prefix_cached_len
+                         prev_chunk_end = s + prefix_cached_len
+                         chunked_attention_mask[:, :, :, prev_chunk_start:prev_chunk_end] = 1
+
+                     current_chunk_start = s + prefix_cached_len
+                     current_chunk_end = e + prefix_cached_len
+                     chunked_attention_mask[:, :, :, current_chunk_start:current_chunk_end] = self.causal_mask

              # Calculate query position if needed
              if self.rbln_config.use_local_attention or self.rbln_config.logits_to_keep > 0:
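In the 4D branch above, each chunk iteration first marks the previous chunk as fully visible and then stamps the causal triangle onto the current chunk's columns. The snippet below is a compact, runtime-independent illustration of that update rule (no prefix caching, toy sizes); it is not the library's code.

import torch

chunk, max_seq_len = 4, 12
# Lower-triangular causal pattern for one chunk, matching the causal_mask built in __init__.
causal_mask = 1 - torch.triu(torch.ones(1, 1, chunk, chunk), diagonal=1)
mask = torch.zeros(1, 1, chunk, max_seq_len)

for step in range(0, max_seq_len, chunk):
    s, e = step, step + chunk
    if step > 0:
        mask[:, :, :, s - chunk : s] = 1   # positions from the previous chunk become fully visible
    mask[:, :, :, s:e] = causal_mask       # causal pattern within the current chunk
    # ... the compiled prefill forward for this chunk would run here with `mask` ...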
@@ -464,7 +605,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                  query_position = None

              # Forward pass for the current chunk
-             output_logit = super().forward(
+             _ = super().forward(
                  input_chunk,
                  cache_pos_chunk,
                  block_tables,
@@ -474,31 +615,33 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                  chunked_attention_mask if self.rbln_config.use_attention_mask else None,
                  position_ids_chunk,
                  lora_int_ids if self.rbln_config.use_lora else None,
-                 out=self.out_buffers,
+                 out=out_buffers[i],
              )
-             output_logits.append(output_logit)

          # Aggregate output_logits
-         output_logits = torch.concat(output_logits, dim=-2)
-         if self.rbln_config.logits_to_keep > 0:
-             output_logits = output_logits[:, -self.rbln_config.logits_to_keep :, :]
+         padding_size = (self.rbln_config.prefill_chunk_size - query_length) % self.rbln_config.prefill_chunk_size
+         if self.rbln_config.logits_to_keep == 1:
+             output_logits = output_logits
+         elif self.rbln_config.logits_to_keep > 1:
+             output_logits = output_logits[:, -padding_size - self.rbln_config.logits_to_keep : -padding_size, :]
          else:
-             output_logits = output_logits[:, :query_length, :]
-             # index copy for masked output_logits
-             if attention_mask is not None:
-                 new_output_logits = torch.full(
-                     (1, attention_mask.shape[-1], output_logits.shape[-1]),
-                     fill_value=1e-10,
-                     dtype=output_logits.dtype,
-                 )
-                 mask_indices = torch.nonzero(attention_mask, as_tuple=True)[0]
-                 new_output_logits.index_copy_(dim=-2, index=mask_indices, source=output_logits)
+             output_logits = output_logits[:, :-padding_size, :]

-                 output_logits = new_output_logits
+         all_hidden_states = None
+         if self.rbln_config.output_hidden_states:
+             all_hidden_states = [
+                 output_hidden_state[:, :-padding_size, :] for output_hidden_state in output_hidden_states
+             ]
+             all_hidden_states = tuple(all_hidden_states)

          # Update decoder attention mask with processed KV-cache length from prefill phase
          if self.rbln_config.can_generate and not is_external_block_tables and self.rbln_config.use_attention_mask:
-             self.dec_attn_mask[batch_idx].fill_(0)
-             self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
+             if self.rbln_config.use_position_ids:
+                 self.dec_attn_mask[batch_idx : batch_idx + 1] = chunked_attention_mask
+             else:
+                 self.dec_attn_mask[batch_idx].fill_(0)
+                 self.dec_attn_mask[batch_idx, :, :, :query_length] = 1

-         return RBLNDecoderOnlyOutput(logits=output_logits, padded_cache_lengths=padded_cache_lengths)
+         return RBLNDecoderOnlyOutput(
+             logits=output_logits, padded_cache_lengths=padded_cache_lengths, hidden_states=all_hidden_states
+         )
optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py
@@ -12,10 +12,12 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import TYPE_CHECKING, Any, Dict, Optional
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Union

  import torch
+ from transformers import GenerationConfig
  from transformers.generation.utils import GenerationMixin
+ from transformers.modeling_outputs import ModelOutput


  if TYPE_CHECKING:
@@ -91,20 +93,26 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
          self,
          input_ids: torch.LongTensor,
          attention_mask: Optional[torch.LongTensor] = None,
-         max_length: Optional[int] = None,
+         generation_config: Optional[GenerationConfig] = None,
          **kwargs,
-     ):
+     ) -> Union[ModelOutput, torch.LongTensor]:
          """
          The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+         Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) for more details.

          Args:
-             input_ids: The input ids to the model.
-             attention_mask: The attention mask to the model.
-             max_length: The maximum length of the generated text.
-             kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
+             input_ids (torch.LongTensor): The input ids to the model.
+             attention_mask (torch.LongTensor, optional): The attention mask to the model.
+             generation_config (GenerationConfig, optional): The generation configuration to be used as base parametrization for the generation call. **kwargs passed to generate matching the attributes of generation_config will override them.
+                 If generation_config is not provided, the default will be used, which had the following loading priority: 1) from the generation_config.json model file, if it exists; 2) from the model configuration.
+                 Please note that unspecified parameters will inherit [GenerationConfig](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig)’s default values.
+             kwargs (dict[str, Any], optional): Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
+
+         Returns:
+             A ModelOutput (if return_dict_in_generate=True or when config.return_dict_in_generate=True) or a torch.LongTensor.
          """
-         if max_length is not None:
-             kwargs["max_length"] = max_length
+         if generation_config is not None:
+             kwargs["generation_config"] = generation_config
          if attention_mask is not None:
              kwargs["attention_mask"] = attention_mask

optimum/rbln/transformers/models/decoderonly/lora_architecture.py
@@ -142,7 +142,7 @@ class LoRALinear(nn.Module):
          padded_lora_a = []
          padded_lora_b = []

-         for i, (lora_a, lora_b) in enumerate(zip(lora_a_weights, lora_b_weights)):
+         for _, (lora_a, lora_b) in enumerate(zip(lora_a_weights, lora_b_weights)):
              current_rank = lora_a.shape[0]
              if current_rank < max_rank:
                  # Pad with zeros
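A usage sketch for the generate() signature change in the generation_decoderonly.py hunk above: callers now pass a GenerationConfig instead of a bare max_length. The model class and loading arguments below are illustrative assumptions (RBLNLlamaForCausalLM and export=True follow optimum-rbln's usual conventions; check the documentation for your model), while the generate() call itself is the standard transformers GenerationMixin API that this mixin wraps.

from transformers import AutoTokenizer, GenerationConfig

from optimum.rbln import RBLNLlamaForCausalLM  # assumed class; pick the RBLN class matching your checkpoint

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# export=True compiles the model for RBLN NPUs at load time (assumed loading convention).
model = RBLNLlamaForCausalLM.from_pretrained(model_id, export=True)

# Previously: model.generate(**inputs, max_length=...). Now a GenerationConfig is passed through.
gen_config = GenerationConfig(max_new_tokens=64, do_sample=False)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
output_ids = model.generate(**inputs, generation_config=gen_config)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))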