optimum-rbln 0.8.2a6__tar.gz → 0.8.2rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of optimum-rbln might be problematic.

Files changed (300):
  1. optimum_rbln-0.8.2rc0/.github/version.yaml +1 -0
  2. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/PKG-INFO +1 -1
  3. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/__init__.py +4 -0
  4. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/__version__.py +2 -2
  5. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/__init__.py +4 -0
  6. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/__init__.py +4 -0
  7. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +1 -2
  8. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +1 -1
  9. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +11 -0
  10. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +66 -34
  11. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llava/modeling_llava.py +6 -5
  12. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/test_base.py +7 -2
  13. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/test_llm.py +1 -3
  14. optimum_rbln-0.8.2a6/.github/version.yaml +0 -1
  15. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  16. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  17. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  18. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/ISSUE_TEMPLATE/model_request.md +0 -0
  19. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/pull_request_template.md +0 -0
  20. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/scripts/auto_code_review.py +0 -0
  21. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/scripts/validate_docstrings.py +0 -0
  22. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/scripts/validate_pr_checklist.py +0 -0
  23. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/auto_code_review.yml +0 -0
  24. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/check_code_quality.yml +0 -0
  25. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/deploy-on-tag.yaml +0 -0
  26. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/deploy.yaml +0 -0
  27. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/pr-title-check.yaml +0 -0
  28. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/pr_checklist_validator.yml +0 -0
  29. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/rbln_check_compiler.yaml +0 -0
  30. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/rbln_dispatch_pytest.yaml +0 -0
  31. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/rbln_optimum_inference_test.yaml +0 -0
  32. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/rbln_optimum_pytest.yaml +0 -0
  33. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/rbln_scheduled_test.yaml +0 -0
  34. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/rbln_trigger_on_pr.yaml +0 -0
  35. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.github/workflows/test-docstrings.yml +0 -0
  36. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/.gitignore +0 -0
  37. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/CODE_OF_CONDUCT.md +0 -0
  38. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/CONTRIBUTING.md +0 -0
  39. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/LICENSE +0 -0
  40. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/README.md +0 -0
  41. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/assets/rbln_logo.png +0 -0
  42. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/advanced/custom_class.py +0 -0
  43. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/audio-classification/run_ast_audio_classification.py +0 -0
  44. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/depth-estimation/run_dpt.py +0 -0
  45. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/image-classification/run_image_classification.py +0 -0
  46. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/image-classification/run_vit_image_classification.py +0 -0
  47. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/image-to-text/run_idefics3.py +0 -0
  48. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/image-to-text/run_llava_next_image_to_text.py +0 -0
  49. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2.py +0 -0
  50. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2_combined.py +0 -0
  51. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2_img2img.py +0 -0
  52. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2_img2img_combined.py +0 -0
  53. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2_inpaint.py +0 -0
  54. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2_inpaint_combined.py +0 -0
  55. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/kandinsky2_2/run_kandinsky2_2_prior_interpolate.py +0 -0
  56. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/question-answering/run_question_answering.py +0 -0
  57. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/speech-recognition/run_wav2vec2.py +0 -0
  58. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/speech-recognition/run_whisper.py +0 -0
  59. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion.py +0 -0
  60. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion_controlnet.py +0 -0
  61. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion_img2img.py +0 -0
  62. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion_img2img_controlnet.py +0 -0
  63. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion_inpaint.py +0 -0
  64. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion_lora.py +0 -0
  65. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/stable-diffusion/run_stable_diffusion_multicontrolnet.py +0 -0
  66. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text-classification/run_bge_m3_text_classification.py +0 -0
  67. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text-classification/run_bge_reranker_v2_m3_text_classification.py +0 -0
  68. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text-classification/run_secureBERT.py +0 -0
  69. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text-classification/run_t5_classification.py +0 -0
  70. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text-classification/run_twitter_roberta_text_classification.py +0 -0
  71. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text2text-generation/run_bart_text2text_generation.py +0 -0
  72. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text2text-generation/run_llama_peft.py +0 -0
  73. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/text2text-generation/run_llama_text2text_generation.py +0 -0
  74. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/examples/time-series-forecasting/run_time_series_forecasting.py +0 -0
  75. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/pyproject.toml +0 -0
  76. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/scripts/uv-lock.sh +0 -0
  77. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/scripts/uv-sync.sh +0 -0
  78. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/configuration_utils.py +0 -0
  79. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/__init__.py +0 -0
  80. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/__init__.py +0 -0
  81. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/__init__.py +0 -0
  82. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +0 -0
  83. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +0 -0
  84. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +0 -0
  85. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +0 -0
  86. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +0 -0
  87. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +0 -0
  88. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +0 -0
  89. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +0 -0
  90. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -0
  91. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +0 -0
  92. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +0 -0
  93. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +0 -0
  94. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +0 -0
  95. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +0 -0
  96. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +0 -0
  97. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/modeling_diffusers.py +0 -0
  98. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/__init__.py +0 -0
  99. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -0
  100. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +0 -0
  101. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +0 -0
  102. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/autoencoders/vae.py +0 -0
  103. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/autoencoders/vq_model.py +0 -0
  104. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/controlnet.py +0 -0
  105. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/transformers/__init__.py +0 -0
  106. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/transformers/prior_transformer.py +0 -0
  107. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +0 -0
  108. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/transformers/transformer_sd3.py +0 -0
  109. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/unets/__init__.py +0 -0
  110. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/models/unets/unet_2d_condition.py +0 -0
  111. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/__init__.py +0 -0
  112. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/controlnet/__init__.py +0 -0
  113. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -0
  114. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +0 -0
  115. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +0 -0
  116. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +0 -0
  117. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +0 -0
  118. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/cosmos/__init__.py +0 -0
  119. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +0 -0
  120. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +0 -0
  121. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +0 -0
  122. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +0 -0
  123. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +0 -0
  124. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +0 -0
  125. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +0 -0
  126. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +0 -0
  127. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +0 -0
  128. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +0 -0
  129. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +0 -0
  130. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +0 -0
  131. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -0
  132. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +0 -0
  133. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +0 -0
  134. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +0 -0
  135. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +0 -0
  136. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +0 -0
  137. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +0 -0
  138. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +0 -0
  139. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +0 -0
  140. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +0 -0
  141. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/modeling.py +0 -0
  142. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/modeling_base.py +0 -0
  143. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/ops/__init__.py +0 -0
  144. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/ops/attn.py +0 -0
  145. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/ops/flash_attn.py +0 -0
  146. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/ops/kv_cache_update.py +0 -0
  147. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/ops/linear.py +0 -0
  148. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/ops/sliding_window_attn.py +0 -0
  149. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/configuration_generic.py +0 -0
  150. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/modeling_attention_utils.py +0 -0
  151. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/modeling_generic.py +0 -0
  152. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/modeling_rope_utils.py +0 -0
  153. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/audio_spectrogram_transformer/__init__.py +0 -0
  154. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -0
  155. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +0 -0
  156. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/auto/__init__.py +0 -0
  157. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/auto/auto_factory.py +0 -0
  158. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/auto/modeling_auto.py +0 -0
  159. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bart/__init__.py +0 -0
  160. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bart/bart_architecture.py +0 -0
  161. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bart/configuration_bart.py +0 -0
  162. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bart/modeling_bart.py +0 -0
  163. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bert/__init__.py +0 -0
  164. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bert/configuration_bert.py +0 -0
  165. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/bert/modeling_bert.py +0 -0
  166. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/blip_2/__init__.py +0 -0
  167. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +0 -0
  168. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +0 -0
  169. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/clip/__init__.py +0 -0
  170. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/clip/configuration_clip.py +0 -0
  171. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/clip/modeling_clip.py +0 -0
  172. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/colpali/__init__.py +0 -0
  173. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/colpali/colpali_architecture.py +0 -0
  174. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/colpali/configuration_colpali.py +0 -0
  175. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/colpali/modeling_colpali.py +0 -0
  176. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/decoderonly/__init__.py +0 -0
  177. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +0 -0
  178. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/distilbert/__init__.py +0 -0
  179. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/distilbert/configuration_distilbert.py +0 -0
  180. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -0
  181. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/dpt/__init__.py +0 -0
  182. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/dpt/configuration_dpt.py +0 -0
  183. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -0
  184. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/exaone/__init__.py +0 -0
  185. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/exaone/configuration_exaone.py +0 -0
  186. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -0
  187. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/exaone/modeling_exaone.py +0 -0
  188. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma/__init__.py +0 -0
  189. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma/configuration_gemma.py +0 -0
  190. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma/gemma_architecture.py +0 -0
  191. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma/modeling_gemma.py +0 -0
  192. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma3/__init__.py +0 -0
  193. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +0 -0
  194. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gpt2/__init__.py +0 -0
  195. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +0 -0
  196. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +0 -0
  197. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +0 -0
  198. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/idefics3/__init__.py +0 -0
  199. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +0 -0
  200. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +0 -0
  201. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llama/__init__.py +0 -0
  202. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llama/configuration_llama.py +0 -0
  203. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llama/llama_architecture.py +0 -0
  204. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llama/modeling_llama.py +0 -0
  205. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llava/__init__.py +0 -0
  206. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llava/configuration_llava.py +0 -0
  207. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llava_next/__init__.py +0 -0
  208. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +0 -0
  209. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +0 -0
  210. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/midm/__init__.py +0 -0
  211. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/midm/configuration_midm.py +0 -0
  212. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/midm/midm_architecture.py +0 -0
  213. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/midm/modeling_midm.py +0 -0
  214. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/mistral/__init__.py +0 -0
  215. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/mistral/configuration_mistral.py +0 -0
  216. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -0
  217. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/mistral/modeling_mistral.py +0 -0
  218. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/opt/__init__.py +0 -0
  219. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/opt/configuration_opt.py +0 -0
  220. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/opt/modeling_opt.py +0 -0
  221. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/opt/opt_architecture.py +0 -0
  222. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pegasus/__init__.py +0 -0
  223. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +0 -0
  224. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +0 -0
  225. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +0 -0
  226. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/phi/__init__.py +0 -0
  227. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/phi/configuration_phi.py +0 -0
  228. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/phi/modeling_phi.py +0 -0
  229. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/phi/phi_architecture.py +0 -0
  230. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pixtral/__init__.py +0 -0
  231. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +0 -0
  232. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +0 -0
  233. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +0 -0
  234. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2/__init__.py +0 -0
  235. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +0 -0
  236. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +0 -0
  237. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -0
  238. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +0 -0
  239. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +0 -0
  240. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +0 -0
  241. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +0 -0
  242. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen3/__init__.py +0 -0
  243. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +0 -0
  244. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +0 -0
  245. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +0 -0
  246. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/resnet/__init__.py +0 -0
  247. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -0
  248. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -0
  249. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/roberta/__init__.py +0 -0
  250. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/roberta/configuration_roberta.py +0 -0
  251. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -0
  252. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/seq2seq/__init__.py +0 -0
  253. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +0 -0
  254. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +0 -0
  255. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +0 -0
  256. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/siglip/__init__.py +0 -0
  257. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/siglip/configuration_siglip.py +0 -0
  258. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/siglip/modeling_siglip.py +0 -0
  259. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/t5/__init__.py +0 -0
  260. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/t5/configuration_t5.py +0 -0
  261. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/t5/modeling_t5.py +0 -0
  262. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/t5/t5_architecture.py +0 -0
  263. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/time_series_transformer/__init__.py +0 -0
  264. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +0 -0
  265. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +0 -0
  266. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -0
  267. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/vit/__init__.py +0 -0
  268. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/vit/configuration_vit.py +0 -0
  269. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/vit/modeling_vit.py +0 -0
  270. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/wav2vec2/__init__.py +0 -0
  271. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +0 -0
  272. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -0
  273. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/whisper/__init__.py +0 -0
  274. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/whisper/configuration_whisper.py +0 -0
  275. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/whisper/generation_whisper.py +0 -0
  276. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/whisper/modeling_whisper.py +0 -0
  277. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -0
  278. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/xlm_roberta/__init__.py +0 -0
  279. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/xlm_roberta/configuration_xlm_roberta.py +0 -0
  280. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -0
  281. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/utils/__init__.py +0 -0
  282. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/transformers/utils/rbln_quantization.py +0 -0
  283. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/__init__.py +0 -0
  284. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/decorator_utils.py +0 -0
  285. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/depreacate_utils.py +0 -0
  286. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/hub.py +0 -0
  287. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/import_utils.py +0 -0
  288. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/logging.py +0 -0
  289. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/model_utils.py +0 -0
  290. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/runtime_utils.py +0 -0
  291. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/save_utils.py +0 -0
  292. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/src/optimum/rbln/utils/submodule.py +0 -0
  293. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/__init__.py +0 -0
  294. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/psnr.py +0 -0
  295. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/requirements_sdxl.txt +0 -0
  296. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/run_stable_diffusion_xl_base.py +0 -0
  297. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/test_config.py +0 -0
  298. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/test_diffusers.py +0 -0
  299. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/tests/test_transformers.py +0 -0
  300. {optimum_rbln-0.8.2a6 → optimum_rbln-0.8.2rc0}/uv.lock +0 -0
optimum_rbln-0.8.2rc0/.github/version.yaml (new file)
@@ -0,0 +1 @@
+ rebel_compiler_version: 0.8.2.dev187+g9f5b6c9b
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: optimum-rbln
- Version: 0.8.2a6
+ Version: 0.8.2rc0
  Summary: Optimum RBLN is the interface between the HuggingFace Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
  Project-URL: Homepage, https://rebellions.ai
  Project-URL: Documentation, https://docs.rbln.ai
src/optimum/rbln/__init__.py
@@ -72,6 +72,8 @@ _import_structure = {
      "RBLNCLIPVisionModelWithProjectionConfig",
      "RBLNColPaliForRetrieval",
      "RBLNColPaliForRetrievalConfig",
+     "RBLNDecoderOnlyModelConfig",
+     "RBLNDecoderOnlyModel",
      "RBLNDecoderOnlyModelForCausalLM",
      "RBLNDecoderOnlyModelForCausalLMConfig",
      "RBLNDistilBertForQuestionAnswering",
@@ -345,6 +347,8 @@ if TYPE_CHECKING:
      RBLNCLIPVisionModelWithProjectionConfig,
      RBLNColPaliForRetrieval,
      RBLNColPaliForRetrievalConfig,
+     RBLNDecoderOnlyModel,
+     RBLNDecoderOnlyModelConfig,
      RBLNDecoderOnlyModelForCausalLM,
      RBLNDecoderOnlyModelForCausalLMConfig,
      RBLNDistilBertForQuestionAnswering,
src/optimum/rbln/__version__.py
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.8.2a6'
- __version_tuple__ = version_tuple = (0, 8, 2, 'a6')
+ __version__ = version = '0.8.2rc0'
+ __version_tuple__ = version_tuple = (0, 8, 2, 'rc0')
src/optimum/rbln/transformers/__init__.py
@@ -62,6 +62,8 @@ _import_structure = {
      "RBLNCLIPVisionModelWithProjectionConfig",
      "RBLNDecoderOnlyModelForCausalLM",
      "RBLNDecoderOnlyModelForCausalLMConfig",
+     "RBLNDecoderOnlyModelConfig",
+     "RBLNDecoderOnlyModel",
      "RBLNDistilBertForQuestionAnswering",
      "RBLNDistilBertForQuestionAnsweringConfig",
      "RBLNDPTForDepthEstimation",
@@ -196,6 +198,8 @@ if TYPE_CHECKING:
      RBLNCLIPVisionModelWithProjectionConfig,
      RBLNColPaliForRetrieval,
      RBLNColPaliForRetrievalConfig,
+     RBLNDecoderOnlyModel,
+     RBLNDecoderOnlyModelConfig,
      RBLNDecoderOnlyModelForCausalLM,
      RBLNDecoderOnlyModelForCausalLMConfig,
      RBLNDistilBertForQuestionAnswering,
src/optimum/rbln/transformers/models/__init__.py
@@ -84,6 +84,8 @@ _import_structure = {
          "RBLNQwen2_5_VLForConditionalGenerationConfig",
      ],
      "decoderonly": [
+         "RBLNDecoderOnlyModelConfig",
+         "RBLNDecoderOnlyModel",
          "RBLNDecoderOnlyModelForCausalLM",
          "RBLNDecoderOnlyModelForCausalLMConfig",
      ],
@@ -216,6 +218,8 @@ if TYPE_CHECKING:
          RBLNColPaliForRetrievalConfig,
      )
      from .decoderonly import (
+         RBLNDecoderOnlyModel,
+         RBLNDecoderOnlyModelConfig,
          RBLNDecoderOnlyModelForCausalLM,
          RBLNDecoderOnlyModelForCausalLMConfig,
      )
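Taken together, the three export hunks above surface the base decoder-only wrapper and its config at the package root. A minimal usage sketch, assuming an already-compiled model directory; the path, the bare constructor call, and any omitted keyword arguments are illustrative and not taken from this diff:

from optimum.rbln import RBLNDecoderOnlyModel, RBLNDecoderOnlyModelConfig

# Hypothetical model directory; a real call would point at a checkpoint or a
# previously compiled RBLN artifact and may need additional keyword arguments.
model = RBLNDecoderOnlyModel.from_pretrained("path/to/compiled-decoder-only-model")
config = RBLNDecoderOnlyModelConfig()  # assumed to accept the usual RBLNModelConfig kwargs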
src/optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py
@@ -1025,8 +1025,7 @@ class SlidingWindowAttentionOp(AttentionOp):
          }

          if self.phase == "prefill" or self.phase == "image_prefill":
-             if not self.use_attention_mask or self.use_position_ids:
-                 op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
+             op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.

          attn_op_name = self.get_attn_op_name()
          attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
src/optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py
@@ -1403,7 +1403,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel):
          )
          padded_cache_lengths = torch.zeros_like(generate_idx)

-         # Prefll
+         # Prefill
          if cache_position is None:
              logits = []
              inputs = inputs_embeds if inputs_embeds is not None else input_ids
src/optimum/rbln/transformers/models/gemma3/configuration_gemma3.py
@@ -23,14 +23,17 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
          self,
          use_position_ids: Optional[bool] = None,
          use_attention_mask: Optional[bool] = None,
+         prefill_chunk_size: Optional[int] = None,
          image_prefill_chunk_size: Optional[int] = None,
          **kwargs: Dict[str, Any],
      ):
          # use_attention_mask and use_position_ids are always True for Gemma3
          use_attention_mask = use_attention_mask or True
          use_position_ids = use_position_ids or True
+         prefill_chunk_size = prefill_chunk_size or 256

          super().__init__(
+             prefill_chunk_size=prefill_chunk_size,
              use_attention_mask=use_attention_mask,
              use_position_ids=use_position_ids,
              **kwargs,
@@ -73,3 +76,11 @@ class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):

          self.vision_tower = self.init_submodule_config(RBLNSiglipVisionModelConfig, vision_tower)
          self.language_model = self.init_submodule_config(RBLNGemma3ForCausalLMConfig, language_model)
+
+     @property
+     def image_prefill_chunk_size(self):
+         return self.language_model.image_prefill_chunk_size
+
+     @property
+     def prefill_chunk_size(self):
+         return self.language_model.prefill_chunk_size
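The config hunk above pins use_attention_mask and use_position_ids to True, falls back to a prefill_chunk_size of 256, and lets the conditional-generation config forward its language model's chunk sizes. A small sketch of how those defaults could resolve, assuming the config can be constructed with defaults and the parent class stores these keyword arguments as attributes:

from optimum.rbln.transformers.models.gemma3.configuration_gemma3 import RBLNGemma3ForCausalLMConfig

cfg = RBLNGemma3ForCausalLMConfig()      # no chunk sizes given
assert cfg.use_attention_mask            # forced on for Gemma3
assert cfg.use_position_ids
assert cfg.prefill_chunk_size == 256     # fallback introduced by this hunk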
src/optimum/rbln/transformers/models/gemma3/modeling_gemma3.py
@@ -258,17 +258,45 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):

          return inputs_embeds

+     def get_padded_cache_position(
+         self,
+         cache_position: torch.Tensor,  # shape: [1, seq_len]
+         token_type_ids: torch.Tensor,  # shape: [1, seq_len]
+     ) -> torch.Tensor:
+         seq_len = cache_position[0][-1].item() + 1
+
+         # Find image start positions
+         image_starts = [
+             s
+             for s in torch.where(token_type_ids == 1)[1]
+             if torch.all(token_type_ids[:, s : s + self.rbln_config.image_prefill_chunk_size] == 1)
+         ]
+
+         # Initialize padded tensors
+         padded_input_len = seq_len
+         for image_start in image_starts:
+             pad_needed = (
+                 self.rbln_config.image_prefill_chunk_size
+                 - (image_start + padded_input_len - seq_len) % self.rbln_config.image_prefill_chunk_size
+             ) % self.rbln_config.image_prefill_chunk_size
+             padded_input_len += pad_needed
+
+         return torch.cat(
+             [cache_position, torch.arange(seq_len, padded_input_len, dtype=torch.int32).unsqueeze(0)],
+             dim=1,
+         )
+
      def forward(
          self,
          input_ids: torch.LongTensor = None,
+         attention_mask: torch.Tensor = None,
+         token_type_ids: torch.Tensor = None,
          pixel_values: torch.FloatTensor = None,
-         attention_mask: Optional[torch.Tensor] = None,
          cache_position: Optional[torch.LongTensor] = None,
          inputs_embeds: Optional[torch.FloatTensor] = None,
          generate_idx: Optional[torch.Tensor] = None,
          padded_cache_lengths: Optional[torch.Tensor] = None,
          position_ids: Optional[torch.Tensor] = None,
-         token_type_ids: Optional[torch.Tensor] = None,
          **lm_kwargs: Dict[str, Any],
      ) -> Union[Tuple, RBLNDecoderOnlyForCausalLMOutput]:
          # prefill
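The get_padded_cache_position helper added above appends extra cache positions so that each full image block can start on a multiple of image_prefill_chunk_size; the padding itself is inserted later during chunked prefill. A worked sketch of the same arithmetic in plain Python, with an illustrative chunk size and token layout:

IMAGE_CHUNK = 256                  # stands in for rbln_config.image_prefill_chunk_size
seq_len = 300                      # e.g. 40 text tokens, one 256-token image, 4 trailing text tokens
image_starts = [40]                # starts of full image blocks (token_type_id == 1)

padded_input_len = seq_len
for image_start in image_starts:
    pad_needed = (
        IMAGE_CHUNK - (image_start + padded_input_len - seq_len) % IMAGE_CHUNK
    ) % IMAGE_CHUNK
    padded_input_len += pad_needed

print(padded_input_len)            # 516: 216 extra positions so the image block starts at 256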
@@ -279,12 +307,15 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):

          for b_idx in range(batch_size):
              cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
+             token_type_id = token_type_ids[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
+             cache_position = self.get_padded_cache_position(cache_position, token_type_id)
+
              output = self.language_model.prefill_decoder(
                  inputs_embeds=inputs_embeds[b_idx : b_idx + 1],
                  attention_mask=attention_mask[b_idx],
                  cache_position=cache_position,
                  batch_idx=b_idx,
-                 token_type_ids=token_type_ids[b_idx : b_idx + 1] if token_type_ids is not None else None,
+                 token_type_ids=token_type_ids[b_idx : b_idx + 1],  # do not pass token_type_id
              )
              padded_cache_lengths[b_idx] += output.padded_cache_lengths
              logits.append(output.logits)
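Before padding the cache positions, the per-sample loop above first drops padded positions from token_type_ids using that sample's attention mask. A standalone illustration of the boolean-mask indexing with made-up tensors:

import torch

token_type_ids = torch.tensor([[0, 0, 1, 1, 0, 0]])   # last two positions are padding
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0]])

b_idx = 0
token_type_id = token_type_ids[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
print(token_type_id)   # tensor([[0, 0, 1, 1]]): padding removed, batch dimension kept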
@@ -336,9 +367,10 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
          # chunked_attention_mask shape
          chunked_attention_mask = torch.zeros(1, chunked_attention_mask.shape[-1], dtype=torch.float32)

-         # as gemma3 has different prefill chunk size for image and text, we need to pad the inputs to the max of the two.
+         # In case of Gemma3ForConditionalGeneration, the loop counter may not be a prefill_chunk_size,
+         # so we cannot guarantee that the last chunk starts at a position that is a multiple of prefill_chunk_size.
          if self.rbln_config.use_image_prefill:
-             padding_size = max(self.rbln_config.prefill_chunk_size, self.rbln_config.image_prefill_chunk_size)
+             padding_size = self.rbln_config.image_prefill_chunk_size
              inputs = torch.nn.functional.pad(inputs, (0, 0, 0, padding_size))
              cache_position = torch.nn.functional.pad(cache_position, (0, padding_size))
              position_ids = torch.nn.functional.pad(position_ids, (0, padding_size))
@@ -389,45 +421,39 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):

          step = 0
          while step < query_length:
-             # Check if the prefill chunk is an image prefill
-             is_image_prefill = self.rbln_config.use_image_prefill and torch.all(
-                 token_type_ids[:, step : step + self.rbln_config.image_prefill_chunk_size] == 1
-             )
-             prefill_chunk_size = (
-                 self.rbln_config.image_prefill_chunk_size if is_image_prefill else self.rbln_config.prefill_chunk_size
-             )
-
-             # Check if the prefill chunk is a text prefill which have image_tokens in it.
-             is_text_prefill_with_image_tokens = (
-                 self.rbln_config.use_image_prefill
-                 and not is_image_prefill
-                 and torch.any(token_type_ids[:, step : step + prefill_chunk_size] == 1)
-             )
-
-             # Check if the prefill chunk crosses a block boundary, requiring padding to align with block boundaries
-             is_cross_block_boundary = (
-                 step // self.rbln_config.kvcache_block_size
-                 != (step + prefill_chunk_size) // self.rbln_config.kvcache_block_size
-             )
+             if self.rbln_config.use_image_prefill:
+                 # Check if the prefill chunk is an image prefill
+                 is_image_prefill = torch.all(
+                     token_type_ids[:, step : step + self.rbln_config.image_prefill_chunk_size] == 1
+                 )
+                 # Check if the prefill chunk is a text prefill which have image_tokens in it.
+                 is_text_prefill_with_image_tokens = not is_image_prefill and torch.any(
+                     token_type_ids[:, step : step + self.rbln_config.prefill_chunk_size] == 1
+                 )
+             else:
+                 is_image_prefill, is_text_prefill_with_image_tokens = False, False

              # Check if the prefill chunk is the last chunk
-             is_last_chunk = step + prefill_chunk_size >= query_length
+             is_last_chunk = step + self.rbln_config.prefill_chunk_size >= query_length

-             if is_cross_block_boundary:
-                 padding_size = prefill_chunk_size - (step + prefill_chunk_size) % self.rbln_config.kvcache_block_size
-                 padded_cache_lengths += padding_size
+             input_chunk = inputs[:, step : step + self.rbln_config.prefill_chunk_size]
+             cache_pos_chunk = (
+                 cache_position[:, step : step + self.rbln_config.prefill_chunk_size] + padded_cache_lengths
+             )
+             position_ids_chunk = position_ids[:, step : step + self.rbln_config.prefill_chunk_size]

              # if text_prefill end with image_tokens, we only treat the text part.
-             num_processed_tokens = prefill_chunk_size
+             num_processed_tokens = self.rbln_config.prefill_chunk_size
+             current_padded_cache_lengths = 0
              if is_text_prefill_with_image_tokens:
-                 first_image_token_idx = torch.where(token_type_ids[:, step : step + prefill_chunk_size] == 1)[1][0]
+                 first_image_token_idx = torch.where(
+                     token_type_ids[:, step : step + self.rbln_config.prefill_chunk_size] == 1
+                 )[1][0]
                  num_processed_tokens = first_image_token_idx.item()
+                 current_padded_cache_lengths = self.rbln_config.prefill_chunk_size - num_processed_tokens
              if is_last_chunk:
                  num_processed_tokens = query_length - step

-             input_chunk = inputs[:, step : step + prefill_chunk_size]
-             cache_pos_chunk = cache_position[:, step : step + prefill_chunk_size].clone() + padded_cache_lengths
-             position_ids_chunk = position_ids[:, step : step + prefill_chunk_size].clone()
              chunked_attention_mask[
                  :, step + padded_cache_lengths : step + num_processed_tokens + padded_cache_lengths
              ] = 1
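In the rewritten loop above, a chunk that mixes text and image tokens is processed only up to the first image token; the rest of the fixed-size chunk is recorded as cache padding via current_padded_cache_lengths. A minimal numeric sketch of that rule, with an illustrative chunk size and token layout:

import torch

PREFILL_CHUNK = 8                                            # illustrative; the real chunk size is larger
token_type_ids = torch.tensor([[0, 0, 0, 0, 0, 1, 1, 1]])    # 5 text tokens, then image tokens
step = 0

is_image_prefill = bool(torch.all(token_type_ids[:, step : step + PREFILL_CHUNK] == 1))
is_text_prefill_with_image_tokens = not is_image_prefill and bool(
    torch.any(token_type_ids[:, step : step + PREFILL_CHUNK] == 1)
)

num_processed_tokens = PREFILL_CHUNK
current_padded_cache_lengths = 0
if is_text_prefill_with_image_tokens:
    first_image_token_idx = torch.where(token_type_ids[:, step : step + PREFILL_CHUNK] == 1)[1][0]
    num_processed_tokens = first_image_token_idx.item()                    # 5: only the text is processed
    current_padded_cache_lengths = PREFILL_CHUNK - num_processed_tokens    # 3 positions become padding

print(num_processed_tokens, current_padded_cache_lengths)                  # 5 3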
@@ -456,6 +482,7 @@ class RBLNGemma3RuntimeModel(RBLNRuntimeModel):
                  out=out_buffers,
              )

+             padded_cache_lengths += current_padded_cache_lengths
              step += num_processed_tokens

              if not is_external_block_tables:
@@ -633,6 +660,11 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
              raise ValueError("use_attention_mask and use_position_ids must be True for RBLNGemma3ForCausalLM")

          if rbln_config.use_image_prefill:
+             if rbln_config.prefill_chunk_size != rbln_config.image_prefill_chunk_size:
+                 raise NotImplementedError(
+                     "Not implemented for different prefill chunk sizes between text and image prefill."
+                 )
+
              # Update image prefill compile config
              img_prefill_input_info = cls.get_input_info(
                  batch_size=1,
src/optimum/rbln/transformers/models/llava/modeling_llava.py
@@ -46,10 +46,7 @@ class LoopVisionTower:
      def __init__(self, vision_tower: RBLNModel) -> None:
          self.vision_tower = vision_tower

-     def forward(self, *args, **kwargs):
-         pixel_values = args[0]
-         image_sizes = kwargs.pop("image_sizes", None)
-
+     def forward(self, pixel_values, image_sizes: Optional[torch.Tensor] = None, **kwargs):
          outputs = []
          for i in range(pixel_values.shape[0]):
              outputs.append(
@@ -161,6 +158,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
          model_config: Optional["PretrainedConfig"] = None,
          rbln_config: Optional[RBLNModelConfig] = None,
      ) -> RBLNModelConfig:
+         # support for pixtral that needs padding
          if hasattr(rbln_config.vision_tower, "max_image_size"):
              num_positions = (
                  rbln_config.vision_tower.batch_size
@@ -171,7 +169,10 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):

          else:
              num_positions = (model_config.vision_config.image_size // model_config.vision_config.patch_size) ** 2 + 1
-             selected_image_feature_dim = num_positions - 1
+             if model_config.vision_feature_select_strategy == "default":
+                 selected_image_feature_dim = num_positions - 1
+             else:
+                 selected_image_feature_dim = num_positions

          input_info = [
              (
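The change above keeps the CLS position when vision_feature_select_strategy is not "default". A small arithmetic sketch under assumed CLIP-like dimensions (the numbers are illustrative, not taken from this diff):

image_size, patch_size = 336, 14
num_positions = (image_size // patch_size) ** 2 + 1     # 576 patch tokens + 1 CLS = 577

vision_feature_select_strategy = "default"              # "default" drops the CLS token
if vision_feature_select_strategy == "default":
    selected_image_feature_dim = num_positions - 1      # 576
else:                                                   # e.g. "full" keeps every position
    selected_image_feature_dim = num_positions          # 577
print(selected_image_feature_dim)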
tests/test_base.py
@@ -200,8 +200,13 @@ class BaseTest:
          if self.EXPECTED_OUTPUT:
              from simphile import jaccard_similarity

-             similarity = jaccard_similarity(output, self.EXPECTED_OUTPUT)
-             self.assertGreater(similarity, 0.9)
+             if isinstance(self.EXPECTED_OUTPUT, str):
+                 similarity = jaccard_similarity(output, self.EXPECTED_OUTPUT)
+                 self.assertGreater(similarity, 0.9)
+             else:
+                 for o, e_o in zip(output, self.EXPECTED_OUTPUT):
+                     similarity = jaccard_similarity(o, e_o)
+                     self.assertGreater(similarity, 0.9)

      def _inner_test_save_load(self, tmpdir):
          with ContextRblnConfig(create_runtimes=False):
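With the branch added above, EXPECTED_OUTPUT can be a single reference string or a sequence compared sample-by-sample. A standalone sketch of that comparison with made-up strings (jaccard_similarity is the simphile function the test already imports):

from simphile import jaccard_similarity

EXPECTED_OUTPUT = ["the cat sat on the mat", "dogs bark at night"]   # illustrative references
output = ["the cat sat on the mat", "dogs bark at night"]            # stand-in for decoded generations

if isinstance(EXPECTED_OUTPUT, str):
    assert jaccard_similarity(output, EXPECTED_OUTPUT) > 0.9
else:
    for o, e_o in zip(output, EXPECTED_OUTPUT):
        assert jaccard_similarity(o, e_o) > 0.9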
tests/test_llm.py
@@ -91,9 +91,7 @@ class LLMTest:
  class TestMistralForCausalLM(LLMTest.TestLLM):
      RBLN_CLASS = RBLNMistralForCausalLM
      HF_MODEL_ID = "openaccess-ai-collective/tiny-mistral"
-     EXPECTED_OUTPUT = (
-         "Edge wat ComecidBusDonald=-Battle Orts html тиційsprintfвата Orts sect matches terrible occup"
-     )
+     EXPECTED_OUTPUT = "watasurescid completionennen Brad completion жеULT ba completion影 Fin сво Regimentixon cabin影 provisions bland"
      HF_CONFIG_KWARGS = {"num_hidden_layers": 1, "max_position_embeddings": 1024, "sliding_window": 512}

optimum_rbln-0.8.2a6/.github/version.yaml (deleted)
@@ -1 +0,0 @@
- rebel_compiler_version: 0.8.1.dev142+gab6ad3c7