optimum-rbln 0.7.5a0__py3-none-any.whl → 0.7.5rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. optimum/rbln/__init__.py +30 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +9 -4
  4. optimum/rbln/modeling.py +7 -5
  5. optimum/rbln/ops/__init__.py +1 -0
  6. optimum/rbln/ops/attn.py +10 -0
  7. optimum/rbln/ops/flash_attn.py +8 -0
  8. optimum/rbln/ops/sliding_window_attn.py +111 -0
  9. optimum/rbln/transformers/__init__.py +32 -3
  10. optimum/rbln/transformers/models/__init__.py +37 -0
  11. optimum/rbln/transformers/models/auto/__init__.py +1 -0
  12. optimum/rbln/transformers/models/auto/modeling_auto.py +7 -0
  13. optimum/rbln/transformers/models/blip_2/__init__.py +20 -0
  14. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +93 -0
  15. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +298 -0
  16. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +12 -6
  17. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +189 -90
  18. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +186 -95
  19. optimum/rbln/transformers/models/exaone/exaone_architecture.py +5 -1
  20. optimum/rbln/transformers/models/gemma/gemma_architecture.py +5 -1
  21. optimum/rbln/transformers/models/gemma3/__init__.py +16 -0
  22. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +69 -0
  23. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +446 -0
  24. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +1057 -0
  25. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +4 -1
  26. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +11 -7
  27. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -4
  28. optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
  29. optimum/rbln/transformers/models/opt/__init__.py +16 -0
  30. optimum/rbln/transformers/models/opt/configuration_opt.py +19 -0
  31. optimum/rbln/transformers/models/opt/modeling_opt.py +80 -0
  32. optimum/rbln/transformers/models/opt/opt_architecture.py +77 -0
  33. optimum/rbln/transformers/models/phi/phi_architecture.py +4 -1
  34. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -11
  35. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +35 -52
  36. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -0
  37. optimum/rbln/transformers/models/siglip/__init__.py +20 -0
  38. optimum/rbln/transformers/models/siglip/configuration_siglip.py +66 -0
  39. optimum/rbln/transformers/models/siglip/modeling_siglip.py +146 -0
  40. optimum/rbln/transformers/models/whisper/whisper_architecture.py +1 -0
  41. optimum/rbln/transformers/utils/rbln_quantization.py +121 -72
  42. optimum/rbln/utils/submodule.py +13 -1
  43. {optimum_rbln-0.7.5a0.dist-info → optimum_rbln-0.7.5rc0.dist-info}/METADATA +1 -1
  44. {optimum_rbln-0.7.5a0.dist-info → optimum_rbln-0.7.5rc0.dist-info}/RECORD +46 -31
  45. {optimum_rbln-0.7.5a0.dist-info → optimum_rbln-0.7.5rc0.dist-info}/WHEEL +0 -0
  46. {optimum_rbln-0.7.5a0.dist-info → optimum_rbln-0.7.5rc0.dist-info}/licenses/LICENSE +0 -0

optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

@@ -30,7 +30,7 @@ from ....configuration_utils import RBLNCompileConfig
  from ....modeling import RBLNModel
  from ....utils.logging import get_logger
  from ....utils.runtime_utils import RBLNPytorchRuntime
- from ...utils.rbln_quantization import QuantizationManager
+ from ...utils.rbln_quantization import prepare_model_for_quantization
  from .configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
  from .decoderonly_architecture import (
      DecoderOnlyWrapper,
@@ -59,6 +59,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          kvcache_block_size: int,
          use_attention_mask: bool,
          attn_impl: str,
+         use_position_ids: bool,
          **kwargs: Any,
      ) -> None:
          super().__init__(runtime, **kwargs)
@@ -72,6 +73,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          self.dec_attn_mask = dec_attn_mask
          self.block_tables = block_tables
          self.free_block_pool = free_block_pool
+         self.use_position_ids = use_position_ids

          self.kvcache_block_size = kvcache_block_size
          self.empty_block = -1
@@ -164,6 +166,9 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          batch_idx: Optional[int] = None,
          block_tables: Optional[torch.Tensor] = None,
          position_embed: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         local_block_tables: Optional[torch.Tensor] = None,
      ):
          if input_ids is None and inputs_embeds is None:
              raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
@@ -189,10 +194,19 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                  is_external_block_tables,
                  attention_mask=attention_mask,
                  position_embed=position_embed,
+                 position_ids=position_ids,
+                 local_block_tables=local_block_tables,
              )
          else:
              return self.prefill_forward(
-                 inputs, cache_position, attention_mask, batch_idx, block_tables, position_embed=position_embed
+                 inputs,
+                 cache_position,
+                 attention_mask,
+                 batch_idx,
+                 block_tables,
+                 position_embed=position_embed,
+                 token_type_ids=token_type_ids,
+                 local_block_tables=local_block_tables,
              )

      def decode_forward(
@@ -203,6 +217,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
          is_external_block_tables: bool = None,
          attention_mask: Optional[torch.Tensor] = None,
          position_embed: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         local_block_tables: Optional[torch.Tensor] = None,
      ) -> torch.FloatTensor:
          batch_size = inputs.shape[0]
          if batch_size != self.batch_size:
@@ -232,35 +248,32 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              if self.batch_size < block_tables.shape[0]:
                  block_tables = block_tables[: self.batch_size]

-             if self.batch_size < attention_mask.shape[0]:
+             if attention_mask is not None and self.batch_size < attention_mask.shape[0]:
                  attention_mask = attention_mask[: self.batch_size]

          logits = super().forward(
              inputs,
              cache_position,
-             attention_mask if self.use_attention_mask else None,
              block_tables,
              position_embed,
+             attention_mask if self.use_attention_mask else None,
+             position_ids if self.use_position_ids else None,
          )

-         return logits
+         return RBLNDecoderOnlyOutput(logits=logits)

-     def prefill_forward(
+     def _prepare_prefill_inputs(
          self,
          inputs: torch.Tensor,
-         cache_position: torch.Tensor = None,
+         cache_position: torch.Tensor,
          attention_mask: Optional[torch.Tensor] = None,
-         batch_idx: int = None,
-         block_tables: torch.Tensor = None,
-         is_external_block_tables: bool = None,
          position_embed: Optional[torch.Tensor] = None,
-     ) -> torch.FloatTensor:
+         local_block_tables: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+     ):
          """
-         Performs chunked prefill for efficient KV-cache updates and memory optimization.
-         Instead of processing the entire sequence at once, the input is divided into chunks of size `prefill_chunk_size`,
-         and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
+         Prepare inputs for prefill phase.
          """
-
          # Handle continuous batching in a compiled graph by extracting valid inputs
          # If an attention mask is provided, select only the valid (non-masked) inputs
          inputs = inputs[:, attention_mask.bool()] if attention_mask is not None else inputs
@@ -276,8 +289,11 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              )

          # Initialize attention mask for chunked processing
-         if self.use_attention_mask:
-             chunked_attention_mask = torch.zeros(1, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.float32)
+         chunked_attention_mask = (
+             torch.zeros(1, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.float32)
+             if self.use_attention_mask
+             else None
+         )

          # Buffer for storing output logits
          out_buffers = [
@@ -288,40 +304,88 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              )
          ]

-         # Process input in chunks of size `prefill_chunk_size`
-         for step in range(0, query_length, self.prefill_chunk_size):
-             # Pad input and cache_position if the last chunk is smaller than `prefill_chunk_size`
-             if (step + self.prefill_chunk_size) > query_length:
-                 padding_size = step + self.prefill_chunk_size - query_length
-                 # inputs_embeds
-                 if inputs.dim() == 3:
-                     inputs = torch.nn.functional.pad(inputs, (0, 0, 0, padding_size))
-                 # inputs_ids
-                 else:
-                     inputs = torch.nn.functional.pad(inputs, (0, padding_size))
+         # Pad input and cache_position if the last chunk is smaller than `prefill_chunk_size`
+         if query_length % self.prefill_chunk_size != 0:
+             padding_size = self.prefill_chunk_size - query_length % self.prefill_chunk_size
+             # inputs_embeds
+             if inputs.dim() == 3:
+                 inputs = torch.nn.functional.pad(inputs, (0, 0, 0, padding_size))
+             # inputs_ids
+             else:
+                 inputs = torch.nn.functional.pad(inputs, (0, padding_size))

-                 cache_position = torch.cat(
-                     [
-                         cache_position,
-                         torch.arange(
-                             query_length,
-                             step + self.prefill_chunk_size,
-                             dtype=torch.int32,
-                         ).unsqueeze(0),
-                     ],
-                     dim=-1,
-                 )
+             cache_position = torch.cat(
+                 [
+                     cache_position,
+                     torch.arange(
+                         query_length,
+                         query_length + padding_size,
+                         dtype=torch.int32,
+                     ).unsqueeze(0),
+                 ],
+                 dim=-1,
+             )
+
+             if position_embed is not None:
+                 position_embed = torch.nn.functional.pad(position_embed, (0, 0, 0, padding_size))
+
+         # Overwrite position_ids and padded_cache_lengths
+         position_ids = None
+         padded_cache_lengths = 0
+
+         return (
+             inputs,
+             cache_position,
+             chunked_attention_mask,
+             out_buffers,
+             position_ids,
+             position_embed,
+             padded_cache_lengths,
+             query_length,
+         )

-             if position_embed is not None:
-                 position_embed = torch.nn.functional.pad(position_embed, (0, 0, 0, padding_size))
+     def prefill_forward(
+         self,
+         inputs: torch.Tensor,
+         cache_position: torch.Tensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         batch_idx: int = None,
+         block_tables: torch.Tensor = None,
+         is_external_block_tables: bool = None,
+         position_embed: Optional[torch.Tensor] = None,
+         local_block_tables: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+     ) -> torch.FloatTensor:
+         """
+         Performs chunked prefill for efficient KV-cache updates and memory optimization.
+         Instead of processing the entire sequence at once, the input is divided into chunks of size `prefill_chunk_size`,
+         and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
+         """
+         (
+             inputs,
+             cache_position,
+             chunked_attention_mask,
+             out_buffers,
+             position_ids,
+             position_embed,
+             padded_cache_lengths,
+             query_length,
+         ) = self._prepare_prefill_inputs(
+             inputs, cache_position, attention_mask, position_embed, token_type_ids=token_type_ids
+         )

+         # Process input in chunks of size `prefill_chunk_size`
+         for step in range(0, query_length, self.prefill_chunk_size):
              # Extract the current chunk of inputs and cache positions
              input_chunk = inputs[:, step : step + self.prefill_chunk_size]
              cache_pos_chunk = cache_position[:, step : step + self.prefill_chunk_size]
+             position_ids_chunk = (
+                 position_ids[:, step : step + self.prefill_chunk_size] if position_ids is not None else None
+             )
              if position_embed is not None:
                  position_embed_chunk = position_embed[:, :, :, step : step + self.prefill_chunk_size, :]

-             if self.use_attention_mask:
+             if self.use_attention_mask and not self.use_position_ids:
                  # Update attention mask to ensure proper causal behavior
                  if step >= self.prefill_chunk_size:
                      chunked_attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
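
For reference, a minimal sketch of the arithmetic the new `_prepare_prefill_inputs` helper performs: the whole prompt is now padded once, up front, to the next multiple of `prefill_chunk_size`, instead of padding the final chunk inside the loop as the old code did. The tensor shapes below are illustrative, not taken from the package:

    import torch

    prefill_chunk_size = 128
    input_ids = torch.randint(0, 32000, (1, 300))          # a 300-token prompt
    cache_position = torch.arange(300, dtype=torch.int32).unsqueeze(0)

    query_length = input_ids.shape[-1]
    if query_length % prefill_chunk_size != 0:
        # pad up to the next multiple of the chunk size (300 -> 384, padding_size = 84)
        padding_size = prefill_chunk_size - query_length % prefill_chunk_size
        input_ids = torch.nn.functional.pad(input_ids, (0, padding_size))
        cache_position = torch.cat(
            [cache_position, torch.arange(query_length, query_length + padding_size, dtype=torch.int32).unsqueeze(0)],
            dim=-1,
        )

    assert input_ids.shape[-1] % prefill_chunk_size == 0   # 384
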
@@ -334,10 +398,11 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              logits = super().forward(
                  input_chunk,
                  cache_pos_chunk,
-                 chunked_attention_mask if self.use_attention_mask else None,
-                 query_position,
                  block_tables,
                  position_embed_chunk if position_embed is not None else None,
+                 query_position,
+                 chunked_attention_mask if self.use_attention_mask else None,
+                 position_ids_chunk if self.use_position_ids else None,
                  out=out_buffers,
              )

@@ -346,13 +411,14 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
              self.dec_attn_mask[batch_idx].fill_(0)
              self.dec_attn_mask[batch_idx, :, :, :query_length] = 1

-         return logits
+         return RBLNDecoderOnlyOutput(logits=logits, padded_cache_lengths=padded_cache_lengths)


  @dataclass
  class RBLNDecoderOnlyOutput(ModelOutput):
      logits: torch.FloatTensor = None
      generate_idx: torch.Tensor = None
+     padded_cache_lengths: int = None


  class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
@@ -386,12 +452,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          if self.rbln_config.use_inputs_embeds:
              main_input_name = "inputs_embeds"
              artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
-             with no_init_weights():
-                 self.embed_tokens = torch.nn.Embedding(
-                     self.config.vocab_size,
-                     self.config.hidden_size,
-                     self.config.pad_token_id,
-                 )
+             self.embed_tokens = self._create_embedding_layer()
              self.embed_tokens.load_state_dict(artifacts["embed_tokens"])
          else:
              self.embed_tokens = None
@@ -422,7 +483,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              max_seq_len=self.rbln_config.max_seq_len,
              use_attention_mask=self.rbln_config.use_attention_mask,
              attn_impl=self.rbln_config.attn_impl,
+             use_position_ids=self.rbln_config.use_position_ids,
          )
+
          self.decoders = {}
          for i, batch_size in enumerate(self.rbln_config.decoder_batch_sizes):
              self.decoders[batch_size] = RBLNRuntimeModel(
@@ -437,6 +500,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                  kvcache_block_size=self.rbln_config.kvcache_block_size,
                  use_attention_mask=self.rbln_config.use_attention_mask,
                  attn_impl=self.rbln_config.attn_impl,
+                 use_position_ids=self.rbln_config.use_position_ids,
              )

          # NOTE(eunji): Use a decoder whose batch size matches the model's main batch size for compatibility.
@@ -459,6 +523,15 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              save_dict["embed_tokens"] = model.get_input_embeddings().state_dict()
          torch.save(save_dict, save_dir_path / subfolder / "torch_artifacts.pth")

+     def _create_embedding_layer(self):
+         with no_init_weights():
+             embed_tokens = torch.nn.Embedding(
+                 self.config.vocab_size,
+                 self.config.hidden_size,
+                 self.config.pad_token_id,
+             )
+         return embed_tokens
+
      def get_input_embeddings(self):
          return self.embed_tokens

@@ -482,8 +555,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          trust_remote_code: bool = False,
          **kwargs,
      ):
-         from ...utils.rbln_quantization import prepare_model_for_quantization
-
          kwargs = cls.update_kwargs(kwargs)

          if config is None:
@@ -500,8 +571,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          with no_init_weights():
              model = AutoModelForCausalLM.from_config(config)

-         prepare_model_for_quantization(model, model_id, kwargs.get("num_hidden_layers"))
-
+         model = prepare_model_for_quantization(
+             model,
+             model_id,
+             kwargs.get("num_hidden_layers"),
+             use_auth_token=use_auth_token,
+             revision=revision,
+             cache_dir=cache_dir,
+             force_download=force_download,
+             local_files_only=local_files_only,
+         )
          return model

      def __getattr__(self, __name: str) -> Any:
@@ -528,11 +607,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
      def get_pytorch_model(
          cls, *args, rbln_config: Optional[RBLNDecoderOnlyModelForCausalLMConfig] = None, **kwargs
      ) -> "PreTrainedModel":
-         if (
-             rbln_config is not None
-             and "format" in rbln_config.quantization
-             and rbln_config.quantization["format"] == "rbln"
-         ):
+         if rbln_config and rbln_config.quantization:
              model = cls.get_quantized_model(*args, **kwargs)
          else:
              model = super().get_pytorch_model(*args, **kwargs)
@@ -548,6 +623,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              "kvcache_block_size": rbln_config.kvcache_block_size,
              "use_rotary_emb": cls._use_rotary_emb,
              "use_attention_mask": rbln_config.use_attention_mask,
+             "use_position_ids": rbln_config.use_position_ids,
+             "use_inputs_embeds": rbln_config.use_inputs_embeds,
          }
          return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()

@@ -572,9 +649,10 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              static_tensors[name] = tensor
              context.mark_static_address(tensor)

-         @QuantizationManager.with_quantization_env
-         def compile_model(wrapped_model, compile_config, example_inputs, compile_context, **kwargs):
+         def compile_model(wrapped_model, compile_config, example_inputs, compile_context, quantization):
              try:
+                 if quantization:
+                     quantization.maybe_set_quantization_env()
                  original_linear = torch.nn.functional.linear
                  torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
                  compiled_model = RBLNModel.compile(
@@ -586,14 +664,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                  return compiled_model
              finally:
                  torch.nn.functional.linear = original_linear
+                 if quantization:
+                     quantization.maybe_reset_quantization_env()

          wrapped_model.phase = "prefill"
          compiled_prefill = compile_model(
-             wrapped_model,
-             prefill_compile_config,
-             prefill_example_inputs,
-             context,
-             quantize_config=rbln_config.quantization,
+             wrapped_model, prefill_compile_config, prefill_example_inputs, context, rbln_config.quantization
          )

          wrapped_model.phase = "decode"
@@ -601,11 +677,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          for batch_size, dec_compile_config in zip(rbln_config.decoder_batch_sizes, rbln_compile_configs[1:]):
              dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
              compiled_decoder = compile_model(
-                 wrapped_model,
-                 dec_compile_config,
-                 dec_example_inputs,
-                 context,
-                 quantize_config=rbln_config.quantization,
+                 wrapped_model, dec_compile_config, dec_example_inputs, context, rbln_config.quantization
              )
              compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder

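The `@QuantizationManager.with_quantization_env` decorator is gone; each `compile_model` call now receives the quantization config directly and brackets compilation with `maybe_set_quantization_env()` / `maybe_reset_quantization_env()`. Purely for illustration, the same set/reset discipline written as a context manager (the package keeps the explicit try/finally shown above):

    from contextlib import contextmanager

    @contextmanager
    def quantization_env(quantization):
        # Mirror the try/finally in the diff: set the quantization environment only when a
        # config object is present, and always restore it afterwards.
        if quantization:
            quantization.maybe_set_quantization_env()
        try:
            yield
        finally:
            if quantization:
                quantization.maybe_reset_quantization_env()
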
@@ -763,6 +835,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          query_length: int,
          use_inputs_embeds: bool,
          use_attention_mask: bool,
+         use_position_ids: bool,
          max_seq_len: int,
          kvcache_block_size: int,
          kvcache_num_blocks: int,
@@ -785,26 +858,27 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              ),
          ]

-         if use_attention_mask:
+         max_block_cnt = max_seq_len // kvcache_block_size
+
+         if query_length > 1:
+             input_info.extend([("block_tables", [max_block_cnt], "int16")])
+         else:
+             input_info.extend([("block_tables", [batch_size, max_block_cnt], "int16")])
+
+         if query_length > 1:
              input_info.extend(
                  [
-                     ("attention_mask", [batch_size, 1, query_length, max_seq_len], "float32"),
+                     ("query_position", [], "int16"),
                  ]
              )
-
-         if query_length > 1:
+         if use_attention_mask:
              input_info.extend(
                  [
-                     ("query_position", [], "int16"),
+                     ("attention_mask", [batch_size, 1, query_length, max_seq_len], "float32"),
                  ]
              )
-
-         max_block_cnt = max_seq_len // kvcache_block_size
-
-         if query_length > 1:
-             input_info.extend([("block_tables", [max_block_cnt], "int16")])
-         else:
-             input_info.extend([("block_tables", [batch_size, max_block_cnt], "int16")])
+         if use_position_ids:
+             input_info.append(("position_ids", [batch_size, query_length], "int32"))

          input_info.extend(
              [
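
A hedged sketch of the graph inputs this reordered `get_input_info` appends after the base entries (input ids or embeddings and `cache_position`): `block_tables` and, for prefill graphs, `query_position` now come before the optional `attention_mask`, and `position_ids` is appended last when enabled. The concrete values below are illustrative only:

    # Illustrative values; shapes and dtypes follow the diff above.
    batch_size, query_length, max_seq_len, kvcache_block_size = 1, 128, 4096, 1024
    use_attention_mask, use_position_ids = True, True
    max_block_cnt = max_seq_len // kvcache_block_size  # 4

    input_info = []
    if query_length > 1:   # prefill graph: one shared block table plus a scalar query position
        input_info.append(("block_tables", [max_block_cnt], "int16"))
        input_info.append(("query_position", [], "int16"))
    else:                  # decode graph: one block-table row per batch entry
        input_info.append(("block_tables", [batch_size, max_block_cnt], "int16"))
    if use_attention_mask:
        input_info.append(("attention_mask", [batch_size, 1, query_length, max_seq_len], "float32"))
    if use_position_ids:
        input_info.append(("position_ids", [batch_size, query_length], "int32"))
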
@@ -898,6 +972,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              query_length=rbln_config.prefill_chunk_size,
              use_inputs_embeds=rbln_config.use_inputs_embeds,
              use_attention_mask=rbln_config.use_attention_mask,
+             use_position_ids=rbln_config.use_position_ids,
              max_seq_len=rbln_config.max_seq_len,
              kvcache_block_size=rbln_config.kvcache_block_size,
              kvcache_num_blocks=rbln_config.kvcache_num_blocks,
@@ -916,6 +991,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              query_length=1,
              use_inputs_embeds=rbln_config.use_inputs_embeds,
              use_attention_mask=rbln_config.use_attention_mask,
+             use_position_ids=rbln_config.use_position_ids,
              max_seq_len=rbln_config.max_seq_len,
              kvcache_block_size=rbln_config.kvcache_block_size,
              kvcache_num_blocks=rbln_config.kvcache_num_blocks,
@@ -977,6 +1053,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          generate_idx: Optional[torch.Tensor] = None,
          attention_mask: Optional[torch.LongTensor] = None,
          inputs_embeds: Optional[torch.Tensor] = None,
+         padded_cache_lengths: Optional[torch.Tensor] = None,
          **kwargs,
      ):
          model_inputs = {}
@@ -984,13 +1061,17 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):

          if is_prefill_phase:
              generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
+             padded_cache_lengths = torch.zeros_like(generate_idx)
              cache_position = None
+             position_ids = None
          else:
              if inputs_embeds is not None:
-                 raise NotImplementedError("Specifying inputs_embeds in decoder phase is not supported.")
+                 # if `inputs_embeds` are passed, only use them in the 1st generation step for every prompt.
+                 inputs_embeds = None

              input_ids = input_ids[:, -1:]
-             cache_position = generate_idx
+             position_ids = generate_idx
+             cache_position = generate_idx + padded_cache_lengths if padded_cache_lengths is not None else generate_idx
              generate_idx = generate_idx + 1
              model_inputs.update({"input_ids": input_ids})

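A worked example of how `padded_cache_lengths` feeds into the decode-phase inputs prepared above; the numbers are illustrative and follow directly from the padding logic in `_prepare_prefill_inputs`:

    import torch

    # A 300-token prompt with prefill_chunk_size = 128, so chunked prefill wrote 84 padded
    # slots into the KV cache and reported them via padded_cache_lengths.
    generate_idx = torch.tensor([[300]])          # count of real tokens processed so far
    padded_cache_lengths = torch.tensor([[84]])   # padding reported by prefill_forward

    position_ids = generate_idx                           # rotary position of the next token: 300
    cache_position = generate_idx + padded_cache_lengths  # KV-cache slot it is written to: 384
    generate_idx = generate_idx + 1
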
@@ -1009,6 +1090,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                  "attention_mask": attention_mask,
                  "cache_position": cache_position,
                  "generate_idx": generate_idx,
+                 "position_ids": position_ids,
+                 "padded_cache_lengths": padded_cache_lengths,
              }
          )

@@ -1022,6 +1105,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
      ) -> Dict[str, Any]:
          # update generate_idx
          model_kwargs["generate_idx"] = outputs.generate_idx
+         model_kwargs["padded_cache_lengths"] = outputs.padded_cache_lengths

          return model_kwargs

@@ -1032,6 +1116,10 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
          cache_position: Optional[torch.Tensor] = None,
          attention_mask: Optional[torch.LongTensor] = None,
          generate_idx: Optional[torch.Tensor] = None,
+         padded_cache_lengths: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         return_dict: Optional[torch.Tensor] = None,
          **kwargs,
      ) -> Tuple[torch.FloatTensor]:
          """
@@ -1045,18 +1133,18 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
              logits = []
              inputs = inputs_embeds if inputs_embeds is not None else input_ids
              batch_size = inputs.shape[0]
-
              for b_idx in range(batch_size):
                  cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
-                 logit = self.prefill_decoder(
+                 output = self.prefill_decoder(
                      input_ids=inputs[b_idx : b_idx + 1] if inputs_embeds is None else None,
                      inputs_embeds=inputs[b_idx : b_idx + 1] if inputs_embeds is not None else None,
                      attention_mask=attention_mask[b_idx] if attention_mask is not None else None,
                      cache_position=cache_position,
                      batch_idx=b_idx,
+                     token_type_ids=token_type_ids[b_idx : b_idx + 1] if token_type_ids is not None else None,
                  )
-                 logits.append(logit)
-
+                 padded_cache_lengths[b_idx] += output.padded_cache_lengths
+                 logits.append(output.logits)
              logits = torch.cat(logits, dim=0)
          # Decoder
          else:
@@ -1072,9 +1160,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                  input_ids=input_ids,
                  inputs_embeds=inputs_embeds,
                  cache_position=cache_position,
-             )
+                 position_ids=position_ids if self.rbln_config.use_position_ids else None,
+             ).logits

-         return RBLNDecoderOnlyOutput(
-             logits=logits,
-             generate_idx=generate_idx,
-         )
+         if not return_dict:
+             return logits, generate_idx, padded_cache_lengths
+         else:
+             return RBLNDecoderOnlyOutput(
+                 logits=logits, generate_idx=generate_idx, padded_cache_lengths=padded_cache_lengths
+             )

optimum/rbln/transformers/models/exaone/exaone_architecture.py

@@ -41,7 +41,10 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
          for layer in causal_lm.transformer.h:
              if self.attn_impl == "eager":
                  new_self_attn = ExaoneAttention(
-                     layer.attn.attention, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                     layer.attn.attention,
+                     self.use_attention_mask,
+                     kvcache_block_size=self.kvcache_block_size,
+                     use_position_ids=self.use_position_ids,
                  )
              elif self.attn_impl == "flash_attn":
                  new_self_attn = ExaoneFlashAttention(
@@ -49,6 +52,7 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
                      kvcache_partition_len=self.kvcache_partition_len,
                      use_attention_mask=self.use_attention_mask,
                      kvcache_block_size=self.kvcache_block_size,
+                     use_position_ids=self.use_position_ids,
                  )
              else:
                  raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")

optimum/rbln/transformers/models/gemma/gemma_architecture.py

@@ -34,7 +34,10 @@ class GemmaWrapper(DecoderOnlyWrapper):
          for layer in causal_lm.model.layers:
              if self.attn_impl == "eager":
                  new_self_attn = DecoderOnlyAttention(
-                     layer.self_attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                     layer.self_attn,
+                     self.use_attention_mask,
+                     kvcache_block_size=self.kvcache_block_size,
+                     use_position_ids=self.use_position_ids,
                  )
              elif self.attn_impl == "flash_attn":
                  new_self_attn = DecoderOnlyFlashAttention(
@@ -42,6 +45,7 @@ class GemmaWrapper(DecoderOnlyWrapper):
                      kvcache_partition_len=self.kvcache_partition_len,
                      use_attention_mask=self.use_attention_mask,
                      kvcache_block_size=self.kvcache_block_size,
+                     use_position_ids=self.use_position_ids,
                  )
              else:
                  raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")

optimum/rbln/transformers/models/gemma3/__init__.py (new file)

@@ -0,0 +1,16 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ #     http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from .configuration_gemma3 import RBLNGemma3ForCausalLMConfig, RBLNGemma3ForConditionalGenerationConfig
+ from .modeling_gemma3 import RBLNGemma3ForCausalLM, RBLNGemma3ForConditionalGeneration
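
Assuming the new Gemma3 classes are also re-exported from the top-level `optimum.rbln` namespace (the `optimum/rbln/__init__.py` and `transformers/__init__.py` changes in this release suggest so), usage would look roughly like the sketch below; the checkpoint id and the `export=True` flag follow the usual optimum-rbln convention and are not taken from this diff:

    # Hedged usage sketch; export path and options are assumptions, not packaged documentation.
    from optimum.rbln import RBLNGemma3ForCausalLM

    model = RBLNGemma3ForCausalLM.from_pretrained(
        "google/gemma-3-4b-it",   # illustrative checkpoint id
        export=True,              # compile for the RBLN NPU on first load
    )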