optimum-rbln 0.7.3a2__py3-none-any.whl → 0.7.3a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/optimum/rbln/__version__.py
+++ b/optimum/rbln/__version__.py
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.7.3a2'
-__version_tuple__ = version_tuple = (0, 7, 3)
+__version__ = version = '0.7.3a3'
+__version_tuple__ = version_tuple = (0, 7, 3, 'a3')
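
The substantive fix above is the second line: in the a2 wheel, `__version_tuple__` dropped the pre-release segment, so the tuple for an alpha build compared equal to a final 0.7.3 release. A quick sanity check against the new wheel (hypothetical usage, assuming optimum-rbln 0.7.3a3 is installed):

    # Hypothetical sanity check; assumes the 0.7.3a3 wheel is installed.
    from optimum.rbln.__version__ import __version__, __version_tuple__

    assert __version__ == "0.7.3a3"
    # The tuple now carries the pre-release tag, so alpha builds are
    # distinguishable from the final (0, 7, 3) release by tuple comparison.
    assert __version_tuple__ == (0, 7, 3, "a3")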
--- a/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py
+++ b/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py
@@ -54,7 +54,6 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         block_tables: torch.Tensor,
         free_block_pool: Deque,
         kvcache_block_size: int,
-        kvcache_num_blocks: int,
         use_attention_mask: bool,
         attn_impl: str,
         **kwargs: Any,
@@ -72,7 +71,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         self.free_block_pool = free_block_pool
 
         self.kvcache_block_size = kvcache_block_size
-        self.empty_block = kvcache_num_blocks - 1
+        self.empty_block = -1
         self.attn_impl = attn_impl
 
         if self.phase == "prefill":
@@ -97,58 +96,61 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             torch.Tensor: Updated block tables.
         """
 
-        def update_block(batch_idx, block_idx):
+        NO_BLOCKS_ERROR = (
+            "No memory blocks are available for allocation. "
+            "The generate() API cannot complete this inference task because Paged Attention is not fully supported by optimum-rbln. "
+            "This is supported by vllm-rbln (see: https://docs.rbln.ai/software/model_serving/vllm_support/vllm-rbln.html). "
+            "Using vllm-rbln should fix this issue and enhance inference performance."
+        )
+
+        def update_block(batch_idx: int, block_idx: int):
             """
-            Helper function to update the block table for a given batch index and block index.
             If the block is empty (empty_block), allocates a block from the free_block_pool.
-
-            Args:
-                batch_idx (int): Batch index.
-                block_idx (int): Block index.
-
-            Raises:
-                RuntimeError: Raised if no available blocks are found in the free_block_pool.
             """
             if self.block_tables[batch_idx][block_idx] == self.empty_block:
                 if self.free_block_pool:
                     block = self.free_block_pool.popleft()
                     self.block_tables[batch_idx][block_idx] = block
                 else:
-                    raise RuntimeError("Not available blocks")
+                    raise RuntimeError(NO_BLOCKS_ERROR)
 
-        if self.attn_impl == "eager":
-            if self.phase == "prefill":
-                return self.block_tables[batch_idx]
+        def replace_empty_block(block_tables: torch.Tensor):
+            """
+            Replaces all occurrences of `self.empty_block` in `block_tables` with a dummy block from `self.free_block_pool`.
+            """
+            if not torch.any(block_tables == self.empty_block):
+                return block_tables.clone()
+            elif self.free_block_pool:
+                _free_block = self.free_block_pool[0]
+                return torch.where(block_tables == self.empty_block, _free_block, block_tables)
             else:
-                return self.block_tables
-        # Case for 'flash_attn' attention implementation
+                raise RuntimeError(NO_BLOCKS_ERROR)
+
+        if self.phase == "prefill":
+            # Track previously used blocks and return them to the free_block_pool and
+            # reset the current batch's block table to empty blocks
+            prev_blocks = self.block_tables[batch_idx][self.block_tables[batch_idx] != self.empty_block].tolist()
+            self.free_block_pool.extend(prev_blocks)
+            self.block_tables[batch_idx].fill_(self.empty_block)
+
+            # Get the start (s) and end (e) positions from cache_position and
+            # iterate over the cache positions to allocate necessary blocks
+            s, e = cache_position[0][0].item(), cache_position[0][-1].item()
+            for position in range(s, e + 1, self.kvcache_block_size):
+                block_idx = position // self.kvcache_block_size
+                if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+                    raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
+                update_block(batch_idx, block_idx)
+
+            return replace_empty_block(self.block_tables[batch_idx])
+        # Case for 'decoder' phase, iterate over the cache positions to allocate necessary blocks
         else:
-            if self.phase == "prefill":
-                # Track previously used blocks and return them to the free_block_pool and
-                # reset the current batch's block table to empty blocks
-                prev_blocks = self.block_tables[batch_idx][self.block_tables[batch_idx] != self.empty_block].tolist()
-                self.free_block_pool.extend(prev_blocks)
-                self.block_tables[batch_idx].fill_(self.empty_block)
-
-                # Get the start (s) and end (e) positions from cache_position and
-                # iterate over the cache positions to allocate necessary blocks
-                s, e = cache_position[0][0].item(), cache_position[0][-1].item()
-                for position in range(s, e + 1, self.kvcache_block_size):
-                    block_idx = position // self.kvcache_block_size
-                    if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
-                        raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
-                    update_block(batch_idx, block_idx)
-
-                return self.block_tables[batch_idx]
-
-            # Case for 'decoder' phase, iterate over the cache positions to allocate necessary blocks
-            else:
-                for b_idx in range(self.batch_size):
-                    position = cache_position[b_idx][0].item()
-                    block_idx = position // self.kvcache_block_size
-                    update_block(b_idx, block_idx)
+            for b_idx in range(self.batch_size):
+                position = cache_position[b_idx][0].item()
+                block_idx = position // self.kvcache_block_size
+                update_block(b_idx, block_idx)
 
-            return self.block_tables
+            return replace_empty_block(self.block_tables)
 
     def forward(
         self,
@@ -380,14 +382,10 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
 
         # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
         dec_attn_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.float32)
-        if attn_impl == "eager":
-            block_tables = torch.arange(0, self.batch_size, dtype=torch.int16).reshape(self.batch_size, 1)
-            free_block_pool = None
-        else:
-            block_tables = torch.zeros(
-                self.batch_size, self.max_seq_len // self.kvcache_block_size, dtype=torch.int16
-            ).fill_(self.kvcache_num_blocks - 1)
-            free_block_pool = deque(x for x in range(self.kvcache_num_blocks - 1))
+        block_tables = torch.zeros(
+            self.batch_size, self.max_seq_len // self.kvcache_block_size, dtype=torch.int16
+        ).fill_(-1)
+        free_block_pool = deque(x for x in range(self.kvcache_num_blocks))
 
         self.prefill_decoder = RBLNRuntimeModel(
             runtime=self.model[0],
@@ -399,7 +397,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             block_tables=block_tables,
             free_block_pool=free_block_pool,
             kvcache_block_size=self.kvcache_block_size,
-            kvcache_num_blocks=self.kvcache_num_blocks,
             vocab_size=self.config.vocab_size,
             prefill_chunk_size=self.prefill_chunk_size,
             max_seq_len=self.max_seq_len,
@@ -416,7 +413,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             block_tables=block_tables,
             free_block_pool=free_block_pool,
             kvcache_block_size=self.kvcache_block_size,
-            kvcache_num_blocks=self.kvcache_num_blocks,
             use_attention_mask=self.use_attention_mask,
             attn_impl=attn_impl,
         )
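
Taken together, the modeling_decoderonly.py hunks change the paged-attention bookkeeping in two ways: the "unallocated" sentinel in the block table is now -1 instead of `kvcache_num_blocks - 1` (the old sentinel aliased a real block id, which also forced the free pool to be one block short), and since -1 is not a valid index, `replace_empty_block()` substitutes a dummy-but-real block id before the table is handed to the runtime. The eager/flash-attention split is also gone; both paths share the same pool-based allocator, so the `kvcache_num_blocks` constructor argument is no longer needed. A minimal standalone sketch of the scheme (illustrative names and sizes, not the library API):

    # Minimal sketch of the block allocation introduced in this diff.
    # EMPTY, num_blocks, batch_size, blocks_per_seq are made up for illustration.
    from collections import deque

    import torch

    EMPTY = -1  # sentinel: this logical slot has no physical block yet
    num_blocks, batch_size, blocks_per_seq = 8, 2, 4

    block_tables = torch.full((batch_size, blocks_per_seq), EMPTY, dtype=torch.int16)
    free_pool = deque(range(num_blocks))  # every physical block is allocatable

    def allocate(batch_idx: int, block_idx: int) -> None:
        """Back a logical slot with a physical block on first touch."""
        if block_tables[batch_idx][block_idx] == EMPTY:
            if not free_pool:
                raise RuntimeError("no free KV-cache blocks")
            block_tables[batch_idx][block_idx] = free_pool.popleft()

    def materialize(tables: torch.Tensor) -> torch.Tensor:
        """Swap leftover sentinels for a dummy (but real) block id, since the
        runtime kernel cannot index the cache with -1."""
        if not torch.any(tables == EMPTY):
            return tables.clone()
        return torch.where(tables == EMPTY, free_pool[0], tables)

    allocate(0, 0)
    allocate(0, 1)
    print(materialize(block_tables[0]))  # tensor([0, 1, 2, 2], dtype=torch.int16)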
--- a/optimum/rbln/transformers/models/whisper/whisper_architecture.py
+++ b/optimum/rbln/transformers/models/whisper/whisper_architecture.py
@@ -25,7 +25,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.utils import logging
 
-from ....ops import register_rbln_custom_cache_update
+from ....ops import register_rbln_custom_add_softmax_attention, register_rbln_custom_cache_update
 
 
 logger = logging.get_logger(__name__)
@@ -34,6 +34,7 @@ logger = logging.get_logger(__name__)
 class WhisperWrapper:
     def __init__(self, model, rbln_token_timestamps):
         register_rbln_custom_cache_update()
+        register_rbln_custom_add_softmax_attention()
         self.encoder = WhisperEncoderWrapper(model)
         self.decoder = WhisperDecoderWrapper(model, output_attentions=rbln_token_timestamps)
 
@@ -213,7 +214,7 @@ class WhisperDecoderLayer(nn.Module):
         # Self Attention Block
         residual = hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states, _, self_present_key_value = self.self_attn(
+        hidden_states, self_present_key_value = self.self_attn(
             hidden_states=hidden_states,
             past_key_value=self_past_key_value,
             attention_mask=attention_mask,
@@ -224,7 +225,7 @@ class WhisperDecoderLayer(nn.Module):
         # Cross-Attention Block
         residual = hidden_states
         hidden_states = self.encoder_attn_layer_norm(hidden_states)
-        hidden_states, cross_attn_weights, cross_present_key_value = self.encoder_attn(
+        hidden_states, cross_attn_weights = self.encoder_attn(
             hidden_states=hidden_states,
             past_key_value=cross_past_key_value,
         )
@@ -258,19 +259,8 @@ class WhisperAttention(nn.Module):
 
 
 class WhisperSelfAttention(WhisperAttention):
-    def rbln_cache_update(
-        self,
-        past_key_value: torch.Tensor,
-        key_states: torch.Tensor,
-        value_states: torch.Tensor,
-        cache_position: torch.Tensor,
-    ):
-        s_idx = torch.tensor(cache_position, dtype=torch.int16)
-        axis = torch.tensor(2, dtype=torch.int16)
-
-        key_states = torch.ops.rbln_custom_ops.rbln_cache_update(past_key_value[0], key_states, s_idx, axis)
-        value_states = torch.ops.rbln_custom_ops.rbln_cache_update(past_key_value[1], value_states, s_idx, axis)
-        return key_states, value_states
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int) -> torch.Tensor:
+        return tensor.view(bsz, seq_len, 1, self.num_heads, self.head_dim).transpose(1, 3)
 
     def forward(
         self,
@@ -285,22 +275,27 @@ class WhisperSelfAttention(WhisperAttention):
 
         key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
         value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-        key_states, value_states = self.rbln_cache_update(past_key_value, key_states, value_states, cache_position)
 
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
-        attn_weights = attn_weights + attention_mask
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_output, key_states, value_states = torch.ops.rbln_custom_ops.add_softmax_attn_decode(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask.unsqueeze(2),
+            past_key_value[0].view(bsz, self.num_heads, 1, -1, self.head_dim),
+            past_key_value[1].view(bsz, self.num_heads, 1, -1, self.head_dim),
+            cache_position.expand(bsz, 1),
+            torch.tensor(1.0, dtype=torch.float32),  # scale
+        )
 
-        attn_output = torch.matmul(attn_weights, value_states)
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
         attn_output = self.out_proj(attn_output)
 
-        return attn_output, attn_weights, (key_states, value_states)
+        return attn_output, (key_states, value_states)
 
 
-class WhisperCrossAttention(WhisperSelfAttention):
+class WhisperCrossAttention(WhisperAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -322,4 +317,4 @@ class WhisperCrossAttention(WhisperSelfAttention):
         attn_output = attn_output.reshape(batch_size, query_len, self.embed_dim)
         attn_output = self.out_proj(attn_output)
 
-        return attn_output, attn_weights, (key_states, value_states)
+        return attn_output, attn_weights
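
The whisper_architecture.py hunks replace the two-step eager decode path (custom cache-update op followed by matmul/softmax in PyTorch) with a single fused RBLN custom op, `add_softmax_attn_decode`, which updates the self-attention KV cache and computes masked attention in one kernel; the attention-weights element is dropped from the self-attention return value accordingly, and `WhisperCrossAttention` now derives from plain `WhisperAttention` since it no longer shares the removed helper. For reference, the eager computation the fused op supersedes looked roughly like this (a sketch reconstructed from the removed lines, not the RBLN kernel's implementation):

    # Reference semantics of the removed eager path that the fused
    # add_softmax_attn_decode op replaces (sketch; shapes follow the old code).
    import torch
    import torch.nn.functional as F

    def add_softmax_attention_reference(query_states, key_states, value_states, attention_mask):
        # scores = Q @ K^T; the additive mask is applied before the softmax,
        # and the scale is 1.0, matching the torch.tensor(1.0) argument above
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
        attn_weights = attn_weights + attention_mask
        attn_weights = F.softmax(attn_weights, dim=-1)
        # weighted sum over the values
        return torch.matmul(attn_weights, value_states)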
--- a/optimum_rbln-0.7.3a2.dist-info/METADATA
+++ b/optimum_rbln-0.7.3a3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: optimum-rbln
-Version: 0.7.3a2
+Version: 0.7.3a3
 Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
 Project-URL: Homepage, https://rebellions.ai
 Project-URL: Documentation, https://docs.rbln.ai
--- a/optimum_rbln-0.7.3a2.dist-info/RECORD
+++ b/optimum_rbln-0.7.3a3.dist-info/RECORD
@@ -1,5 +1,5 @@
 optimum/rbln/__init__.py,sha256=eHi15YM3989AcX52jka9rUmgAtlp1PHqMNwBEdOfuu8,6554
-optimum/rbln/__version__.py,sha256=bShBukYvw7AqWtLsut0yClygDEGsFRmxrXypqIeEXcQ,513
+optimum/rbln/__version__.py,sha256=jlkAV1bws10Tgk9b3JF90gq1GOekHphDutCCDtjNFJc,519
 optimum/rbln/modeling.py,sha256=3XE0IrCYbkjw9_Q9BFzZ_ri_Kyxw1g6iwfdohZB46-s,8289
 optimum/rbln/modeling_base.py,sha256=ELSPbjx7awBRM2SckkD-5gI3TIa01mfzz7gDRC1Pljk,21778
 optimum/rbln/modeling_config.py,sha256=7104bxmrvKW4Q6XTruQayiIGl8GHDFmPkJ3cknMIInE,11335
@@ -60,7 +60,7 @@ optimum/rbln/transformers/models/clip/__init__.py,sha256=H9vuBwrmFO0-CqZhXUrKF-u
 optimum/rbln/transformers/models/clip/modeling_clip.py,sha256=NiSm7bHs4SReHDUr53BBWSX0Y8bkKOeUSpsBDrp8YDw,6628
 optimum/rbln/transformers/models/decoderonly/__init__.py,sha256=pDogsdpJKKB5rqnVFrRjwfhUvOSV-jZ3oARMsqSvOOQ,665
 optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=x8_xQ5aGXbadJyajpJQyfgxx4YPSj62VlmmGDMnC-1E,41819
-optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=dyl8tDBjfe5VfU1XbKAoZS7g7F90JTYVmMuz0HTmCoE,35345
+optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=DylKxV1kFbDv34txpuI5JrvMcSTa2W910eO9dmF0o_8,35352
 optimum/rbln/transformers/models/dpt/__init__.py,sha256=gP1tkR3XMNlHq1GT87ugIVvb2o_1eAUg1JaniXjy1Lw,651
 optimum/rbln/transformers/models/dpt/modeling_dpt.py,sha256=ZsS2SOiqcA4azULB-WFEMQZbgIoOyVUKqVKqrw_tWzA,3430
 optimum/rbln/transformers/models/exaone/__init__.py,sha256=zYH_5tVa8-juEdsOIky7I33WSC3Zuhoq1upI0OHYeVw,859
@@ -100,7 +100,7 @@ optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py,sha256=JYJmV52j6c
 optimum/rbln/transformers/models/whisper/__init__.py,sha256=ktnNe5ri3ycCWZ_W_voFB9y9-vgGgxS1X9s8LBRZmWc,665
 optimum/rbln/transformers/models/whisper/generation_whisper.py,sha256=GIHTca3b1VtW81kp7BzKQ7f77c2t9OsEsbZetripgDo,4582
 optimum/rbln/transformers/models/whisper/modeling_whisper.py,sha256=0nBADNxE0A1ozBbRutTBvxpo_Y1qkOycT_zronkN-ZU,15840
-optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=eP3UgkwCRaaFjc5Jc4ZEiWxr3-L7oJx9KzpJ7eFkwUs,13158
+optimum/rbln/transformers/models/whisper/whisper_architecture.py,sha256=_6PmE4-DD5QhohQwHW5M11q_L9f_ayF6StmNTlOYJdg,12896
 optimum/rbln/transformers/models/xlm_roberta/__init__.py,sha256=fC7iNcdxBZ_6eOF2snStmf8r2M3c8O_-XcXnQEaHQCE,653
 optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py,sha256=8YNLz0bc5ze-QuU8rN-QhUfGzlSUs3iMJiWTxO3o6AM,4366
 optimum/rbln/transformers/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -114,7 +114,7 @@ optimum/rbln/utils/model_utils.py,sha256=DfD_Z2qvZHqcddXqnzTM1AN8khanj3-DXK2lJvV
 optimum/rbln/utils/runtime_utils.py,sha256=5-DYniyP59nx-mrrbi7AqA77L85b4Cm5oLpaxidSyss,3699
 optimum/rbln/utils/save_utils.py,sha256=hG5uOtYmecSXZuGTvCXsTM-SiyZpr5q3InUGCCq_jzQ,3619
 optimum/rbln/utils/submodule.py,sha256=oZoGrItB8WqY4i-K9WJPlLlcLohc1YGB9OHB8_XZw3A,4071
-optimum_rbln-0.7.3a2.dist-info/METADATA,sha256=C-IWumO-veJFZPHpF8wcOTOE0TCDzKU1Xk_ylaqrvPM,5300
-optimum_rbln-0.7.3a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-optimum_rbln-0.7.3a2.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
-optimum_rbln-0.7.3a2.dist-info/RECORD,,
+optimum_rbln-0.7.3a3.dist-info/METADATA,sha256=UQs6c3GdXbPYE8wSnT6Ca9TtgfKwEgPNVZk-MoAKQPc,5300
+optimum_rbln-0.7.3a3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+optimum_rbln-0.7.3a3.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+optimum_rbln-0.7.3a3.dist-info/RECORD,,