tpu-inference 0.11.1.dev202511130813__py3-none-any.whl → 0.11.1.dev202511220812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/lora/test_layers.py +0 -6
- tests/lora/utils.py +0 -8
- tests/test_envs.py +182 -0
- tests/test_utils.py +23 -14
- tpu_inference/__init__.py +22 -3
- tpu_inference/core/core_tpu.py +17 -9
- tpu_inference/core/disagg_utils.py +6 -8
- tpu_inference/distributed/tpu_connector.py +2 -3
- tpu_inference/distributed/utils.py +3 -2
- tpu_inference/envs.py +1 -1
- tpu_inference/executors/ray_distributed_executor.py +27 -11
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +110 -64
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +7 -0
- tpu_inference/layers/{jax → common}/attention_interface.py +1 -1
- tpu_inference/layers/common/quant_methods.py +8 -0
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/quantization/__init__.py +7 -3
- tpu_inference/layers/vllm/quantization/awq.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -2
- tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
- tpu_inference/layers/vllm/quantization/unquantized.py +4 -3
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +1 -2
- tpu_inference/models/common/model_loader.py +12 -11
- tpu_inference/models/jax/llama3.py +4 -3
- tpu_inference/models/jax/llama_eagle3.py +9 -5
- tpu_inference/models/jax/llama_guard_4.py +361 -0
- tpu_inference/models/jax/qwen2.py +3 -2
- tpu_inference/models/jax/qwen2_5_vl.py +4 -3
- tpu_inference/models/jax/qwen3.py +3 -2
- tpu_inference/models/jax/utils/weight_utils.py +21 -8
- tpu_inference/models/vllm/vllm_model_wrapper.py +22 -10
- tpu_inference/platforms/tpu_platform.py +17 -7
- tpu_inference/runner/compilation_manager.py +37 -17
- tpu_inference/runner/kv_cache.py +1 -1
- tpu_inference/runner/kv_cache_manager.py +8 -2
- tpu_inference/runner/tpu_runner.py +199 -87
- tpu_inference/spec_decode/jax/eagle3.py +2 -1
- tpu_inference/tpu_info.py +4 -3
- tpu_inference/utils.py +7 -6
- tpu_inference/worker/tpu_worker.py +159 -23
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/METADATA +2 -2
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/RECORD +52 -54
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +0 -28
- tpu_inference/mock/vllm_envs.py +0 -1219
- tpu_inference/mock/vllm_logger.py +0 -212
- tpu_inference/mock/vllm_logging_utils.py +0 -15
- tpu_inference/models/jax/phi3.py +0 -376
- /tpu_inference/layers/{jax → common}/binary_search.py +0 -0
- /tpu_inference/layers/{jax → common}/sharding.py +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/mxfp4.py (new file)
@@ -0,0 +1,266 @@
+from typing import Callable, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import torch
+from jax.experimental.layout import Format, Layout
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torch.nn.parameter import Parameter
+from torchax.interop import jax_view, torch_view
+from torchax.ops.mappings import t2j
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig, FusedMoEQuantConfig, biased_moe_quant_config)
+from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
+                                                         FusedMoEMethodBase)
+from vllm.model_executor.layers.linear import LinearBase
+from vllm.model_executor.layers.quantization import \
+    register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizeMethodBase
+from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
+                                                           Mxfp4Config,
+                                                           Mxfp4MoEMethod)
+from vllm.model_executor.layers.quantization.utils.quant_utils import \
+    is_layer_skipped
+
+from tpu_inference.layers.common.quant_methods import (MXFP4,
+                                                        get_tpu_quant_method)
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
+from tpu_inference.layers.vllm.linear_common import \
+    reorder_concatenated_tensor_for_sharding
+from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedLinearMethod
+
+MXFP4_BLOCK_SIZE = 32
+
+P = PartitionSpec
+logger = init_logger(__name__)
+
+
+# TODO(kyuyeunk): Move these functions into a common utility file.
+def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:
+    assert u8_packed_e2m1.dtype == jnp.uint8
+    e2m1 = jax.lax.bitcast_convert_type(u8_packed_e2m1, jnp.float4_e2m1fn)
+    # bitcast creates one more dimension that splits 8 bits into two e2m1.
+    # we flatten them with the last dim.
+    return jnp.reshape(e2m1, e2m1.shape[:-2] + (-1, ))
+
+
+def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
+    e8_finfo = jnp.finfo(jnp.float8_e8m0fnu)
+    exponents = u8.astype(jnp.int32) + e8_finfo.minexp
+    ones = jnp.ones_like(u8, dtype=jnp.float32)
+    return jnp.ldexp(ones, exponents)
+
+
+def dequantize_block_weight(weight: jax.Array,
+                            scale: jax.Array,
+                            block_size: int,
+                            out_dtype: jnp.dtype = jnp.bfloat16) -> jax.Array:
+    orig_shape = weight.shape
+    weight_block = weight.reshape(orig_shape[:-1] + (-1, block_size))
+    weight_dequantized = weight_block.astype(jnp.float32) * jnp.expand_dims(
+        scale, -1)
+    return weight_dequantized.reshape(orig_shape).astype(out_dtype)
+
+
+@register_quantization_config(get_tpu_quant_method(MXFP4))
+class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
+
+    @classmethod
+    def get_name(cls):
+        return MXFP4
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention  # Avoid circular import
+
+        if isinstance(layer, LinearBase):
+            linear_config = self.get_linear_config(layer)
+            if self.ignored_layers and is_layer_skipped(
+                    prefix=prefix,
+                    ignored_layers=self.ignored_layers,
+                    fused_mapping=self.packed_modules_mapping,
+            ):
+                return VllmUnquantizedLinearMethod(linear_config)
+            # TODO: Add support for MXFP4 Linear Method.
+            # MXFP4 LinearMethod is available in AMD-Quark, refer to that
+            # implementation if you are interested in enabling MXFP4 here.
+            logger.warning_once(
+                "MXFP4 linear layer is not implemented - falling back to "
+                "UnquantizedLinearMethod.")
+            return VllmUnquantizedLinearMethod(linear_config)
+        elif isinstance(layer, FusedMoE):
+            return VllmMxfp4MoEMethod(layer.moe_config, self.mesh)
+        elif isinstance(layer, Attention):
+            # TODO: Add support for MXFP4 Attention.
+            logger.warning_once("MXFP4 attention layer is not implemented. "
+                                "Skipping quantization for this layer.")
+        return None
+
+
+class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
+
+    def __init__(self, moe: FusedMoEConfig, mesh: Mesh):
+        FusedMoEMethodBase.__init__(self, moe)
+
+        # We piggyback on triton implementation as it applies minimal hardware
+        # specific post processing to the weights.
+        self.mxfp4_backend = Mxfp4Backend.TRITON
+        self.mesh = mesh
+
+    def get_fused_moe_quant_config(
+            self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
+        # Because we have dequantized weights, we only need biased moe config.
+        # TODO(kyuyeunk): Add native support for MXFP4.
+        return biased_moe_quant_config(
+            layer.w13_bias,
+            layer.w2_bias,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        assert isinstance(layer, FusedMoE)
+
+        w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
+        w13_weight_scale = e8m0_to_fp32(
+            t2j(layer.w13_weight_scale, use_dlpack=False))
+        w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+
+        w2_weight = u8_unpack_e2m1(t2j(layer.w2_weight, use_dlpack=False))
+        w2_weight_scale = e8m0_to_fp32(
+            t2j(layer.w2_weight_scale, use_dlpack=False))
+        w2_bias = t2j(layer.w2_bias, use_dlpack=False)
+
+        # We dequantize fp4 weights into bf16.
+        # TODO(kyuyeunk): Add native support for MXFP4.
+        w13_weight = dequantize_block_weight(w13_weight, w13_weight_scale,
+                                             MXFP4_BLOCK_SIZE, jnp.bfloat16)
+        w2_weight = dequantize_block_weight(w2_weight, w2_weight_scale,
+                                            MXFP4_BLOCK_SIZE, jnp.bfloat16)
+
+        # Because we have dequantized weights, scales are not used anymore.
+        delattr(layer, "w13_weight_scale")
+        delattr(layer, "w2_weight_scale")
+
+        if layer.activation == "swigluoai":
+            # When using swigluoai, vLLM splits gmm output in a interleaved way.
+            # However, interleaved split is not performant on TPU. Therefore,
+            # we preprocess the weight so that splitting gmm output by middle
+            # can still get the same result.
+            w1_weight = w13_weight[:, ::2, :]
+            w3_weight = w13_weight[:, 1::2, :]
+            w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+
+            w1_bias = w13_bias[:, ::2]
+            w3_bias = w13_bias[:, 1::2]
+            w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
+
+        # TODO(kyuyeunk): Add weight processing logic for the new kernel.
+        if layer.use_ep:
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+
+        else:
+            intermediate_size = w13_weight.shape[1] // 2
+            assert intermediate_size == w2_weight.shape[-1]
+            output_sizes = [intermediate_size, intermediate_size]
+            n_shards = self.mesh.shape["model"]
+            assert intermediate_size % n_shards == 0
+            w13_weight = reorder_concatenated_tensor_for_sharding(w13_weight,
+                                                                  output_sizes,
+                                                                  n_shards,
+                                                                  dim=1)
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, "model", None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, None, "model"))))
+
+            w13_bias = reorder_concatenated_tensor_for_sharding(w13_bias,
+                                                                output_sizes,
+                                                                n_shards,
+                                                                dim=1)
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P(None, "model"))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)), NamedSharding(self.mesh, P(None,
+                                                                  None))))
+
+        layer.w13_weight = Parameter(torch_view(w13_weight),
+                                     requires_grad=False)
+        layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
+
+        layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
+        layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
+
+        pass
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        assert isinstance(layer, FusedMoE)
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax is supported for scoring_func")
+
+        # Use the original implementation
+        output = fused_moe_func_padded(
+            jax_view(x),
+            jax_view(layer.w13_weight),
+            jax_view(layer.w2_weight),
+            jax_view(layer.w13_bias) if self.moe.has_bias else None,
+            jax_view(layer.w2_bias) if self.moe.has_bias else None,
+            jax_view(router_logits),
+            topk=top_k,
+            global_num_experts=global_num_experts,
+            renormalize=renormalize,
+            reduce_results=layer.reduce_results,
+            mesh=self.mesh,
+            use_ep=layer.use_ep,
+            activation=activation,
+        )
+
+        return torch_view(output)
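For readers skimming the hunk above: `u8_unpack_e2m1` splits each packed uint8 into two float4 (e2m1) values, `e8m0_to_fp32` turns the 8-bit exponent scales into float32 powers of two, and `dequantize_block_weight` multiplies each 32-element block by its scale. The snippet below is a minimal, self-contained sketch of that last step only, with made-up shapes and plain float32 inputs; it is not code from the package.

# Block-wise dequantization sketch: each group of `block_size` values along
# the last axis shares one scale entry (the kernel above uses block_size = 32).
import jax.numpy as jnp

block_size = 4
weight = jnp.arange(16, dtype=jnp.float32).reshape(2, 8)              # (rows, K)
scale = jnp.array([[0.5, 2.0], [1.0, 0.25]], dtype=jnp.float32)       # (rows, K // block_size)

blocks = weight.reshape(weight.shape[:-1] + (-1, block_size))         # (rows, K // bs, bs)
dequant = (blocks * scale[..., None]).reshape(weight.shape)           # broadcast one scale per block
print(dequant.astype(jnp.bfloat16))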
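The `swigluoai` branch above also regroups the fused w1/w3 weight so that a cheap split down the middle of the gmm output replaces a strided, interleaved split. A toy illustration of that regrouping, with invented values and a 2-D array instead of the real 3-D per-expert weights:

# Interleaved -> contiguous layout: after regrouping, w1 occupies the first
# half and w3 the second half along the feature axis.
import jax.numpy as jnp

w13 = jnp.array([[10., 30., 11., 31., 12., 32.]])   # elements interleaved: w1, w3, w1, w3, ...
w1 = w13[:, ::2]                                    # strided pick of w1 entries
w3 = w13[:, 1::2]                                   # strided pick of w3 entries
w13_contiguous = jnp.concatenate([w1, w3], axis=1)
print(w13_contiguous)                               # [[10. 11. 12. 30. 31. 32.]]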
tpu_inference/layers/vllm/quantization/unquantized.py
@@ -23,6 +23,8 @@ from vllm.model_executor.layers.quantization.base_config import (
 
 from tpu_inference import envs
 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
+from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
+                                                        get_tpu_quant_method)
 from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
@@ -34,12 +36,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config(
+@register_quantization_config(get_tpu_quant_method(UNQUANTIZED))
 class VllmUnquantizedConfig(QuantizationConfig, JaxCommonConfig):
 
     @classmethod
     def get_name(cls) -> str:
-        return
+        return UNQUANTIZED
 
     @classmethod
     def get_supported_act_dtypes(cls) -> list[torch.dtype]:
@@ -189,7 +191,6 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)
-
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
tpu_inference/layers/vllm/sharding.py
@@ -19,6 +19,7 @@ from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -211,8 +212,7 @@ def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
 def _sharded_device_put(tensor: jax.Array, sharding) -> jax.Array:
     if isinstance(tensor, tuple):
         return tuple(_sharded_device_put(t, sharding) for t in tensor)
-
-    multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
+    multihost_backend = envs.TPU_MULTIHOST_BACKEND
    if multihost_backend != "ray":
         return jax.device_put(tensor, sharding)
 
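The second hunk swaps the inline `os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()` lookup for the package's central `envs` module (the `tpu_inference/envs.py` file also changes in this release). The actual module is not shown in this diff; a hypothetical sketch of the pattern, matching the semantics of the removed line, might look like:

# Hypothetical env-flag accessor; the real tpu_inference.envs module may
# differ. The point is one typed place to read TPU_* flags instead of
# scattered os.environ.get() calls.
import os


class _Envs:

    @property
    def TPU_MULTIHOST_BACKEND(self) -> str:
        # Same defaults as the removed inline call: empty string, lower-cased.
        return os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()


envs = _Envs()

if envs.TPU_MULTIHOST_BACKEND != "ray":
    print("single-host or non-Ray multihost backend")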
tpu_inference/lora/torch_punica_tpu.py
@@ -239,7 +239,6 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         lora_index_to_id: list[Optional[int]],
         max_loras: int,
         vocab_size: int,
-        extra_vocab_size: int,
     ):
         # Pad the prompt mapping to avoid running into recompiles on the TPU
         # TODO: Should this happen inside mapping internally? If so how can we
@@ -258,7 +257,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             lora_index_to_id,
             max_loras,
             vocab_size,
-            extra_vocab_size
+            0,  # extra_vocab_size
             "cpu",
         )
         with torchax.default_env():
tpu_inference/models/common/model_loader.py
@@ -11,7 +11,7 @@ from vllm.config import VllmConfig
 from vllm.utils.func_utils import supports_kw
 
 from tpu_inference import envs
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.quantization.quantization_utils import (
     apply_qwix_on_abstract_model, apply_qwix_quantization,
@@ -36,19 +36,17 @@ def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
     from tpu_inference.models.jax.llama3 import LlamaForCausalLM
     from tpu_inference.models.jax.llama4 import Llama4ForCausalLM
     from tpu_inference.models.jax.llama_eagle3 import EagleLlama3ForCausalLM
-    from tpu_inference.models.jax.
-    from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
+    from tpu_inference.models.jax.llama_guard_4 import LlamaGuard4ForCausalLM
     from tpu_inference.models.jax.qwen2_5_vl import \
         Qwen2_5_VLForConditionalGeneration
     from tpu_inference.models.jax.qwen3 import Qwen3ForCausalLM
     _MODEL_REGISTRY["Llama4ForCausalLM"] = Llama4ForCausalLM
     _MODEL_REGISTRY["DeepseekV3ForCausalLM"] = DeepSeekV3
     _MODEL_REGISTRY["LlamaForCausalLM"] = LlamaForCausalLM
-    _MODEL_REGISTRY["
+    _MODEL_REGISTRY["Llama4ForConditionalGeneration"] = LlamaGuard4ForCausalLM
     _MODEL_REGISTRY["Qwen3ForCausalLM"] = Qwen3ForCausalLM
     _MODEL_REGISTRY[
         "Qwen2_5_VLForConditionalGeneration"] = Qwen2_5_VLForConditionalGeneration
-    _MODEL_REGISTRY["Phi3ForCausalLM"] = Phi3ForCausalLM
     _MODEL_REGISTRY["Eagle3LlamaForCausalLM"] = EagleLlama3ForCausalLM
     _MODEL_REGISTRY["GptOssForCausalLM"] = GptOss
 
@@ -57,8 +55,10 @@ def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
     if arch in _MODEL_REGISTRY:
         return _MODEL_REGISTRY[arch]
     raise UnsupportedArchitectureError(
-        f"Model architectures {architectures}
-
+        f"Model architectures {architectures} not "
+        "registered in tpu-inference. Falling back to vLLM-native "
+        f"Pytorch definition. JAX-native architectures: {list(_MODEL_REGISTRY.keys())}"
+    )
 
 
 def _get_nnx_model(
@@ -217,7 +217,7 @@ def get_flax_model(
             hidden_states_sharding,  # aux hidden states
         ),
         donate_argnums=2,  # 0 is graphdef, 1 is state, 2 is kv_cache
-        static_argnums=
+        static_argnums=7,  #7 is layer_name_to_kvcache_index
     )
     def run_model(graphdef, state, *args):
         model = nnx.merge(graphdef, state)
@@ -242,10 +242,11 @@ def get_flax_model(
         model = nnx.merge(graphdef, state)
         return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
 
+    embed_sharding = NamedSharding(mesh, PartitionSpec(None))
     # This function will calculates the embeddings of input texts and then merge with the image embeddings
     @functools.partial(
         jax.jit,
-        out_shardings=(
+        out_shardings=(embed_sharding),
     )
     def run_get_input_embeddings(graphdef, state, *args, **kwargs):
         model = nnx.merge(graphdef, state)
@@ -325,8 +326,8 @@ def get_model(
         # Convert the error message to a string to check its contents
         error_msg = str(e)
 
-        logger.warning(
-
+        logger.warning(error_msg)
+
         # Fall back to the vLLM model and updating the dtype accordingly
         vllm_config.model_config.dtype = j2t_dtype(
             vllm_config.model_config.dtype.dtype)
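The `static_argnums=7` fix above pins `layer_name_to_kvcache_index` as a compile-time constant for `jax.jit`. The sketch below is a generic illustration of why a non-array mapping argument has to be static and hashable; the function name and signature are invented and unrelated to the runner's real jitted function (where the static argument sits at index 7).

# Generic jax.jit static_argnums illustration: the (name, index) mapping is a
# plain Python value, so it is marked static and becomes part of the
# compilation cache key; a new mapping value triggers a retrace.
import functools

import jax
import jax.numpy as jnp


@functools.partial(jax.jit, static_argnums=1)  # argument 1 is the mapping
def gather_layer(kv, layer_name_to_index):
    idx = dict(layer_name_to_index)["layer.0"]  # resolved at trace time
    return kv[idx] * 2.0


kv = jnp.arange(6.0).reshape(3, 2)
# The static argument must be hashable, hence a tuple of pairs rather than a dict.
print(gather_layer(kv, (("layer.0", 1), ("layer.1", 2))))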
tpu_inference/models/jax/llama3.py
@@ -8,10 +8,10 @@ from transformers import LlamaConfig, modeling_flax_utils
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.rope_interface import apply_rope
-from tpu_inference.layers.jax.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
                                                          load_hf_weights)
@@ -368,7 +368,8 @@ class LlamaForCausalLM(nnx.Module):
             "lm_head": "model.lm_head",
         })
 
-        metadata_map = get_default_maps(self.vllm_config
+        metadata_map = get_default_maps(self.vllm_config.model_config,
+                                        self.mesh, mappings)
         load_hf_weights(vllm_config=self.vllm_config,
                         model=self,
                         metadata_map=metadata_map,
tpu_inference/models/jax/llama_eagle3.py
@@ -194,13 +194,12 @@ class Eagle3LlamaModel(nnx.Module):
 
 def update_reshape_map_for_eagle3(vllm_config: VllmConfig,
                                   metadata_map: MetadataMap):
-    model_config = vllm_config.
+    model_config = vllm_config.speculative_config.draft_model_config
     hf_config = model_config.hf_config
 
     num_heads = hf_config.num_attention_heads
     num_kv_heads = hf_config.num_key_value_heads
-    hidden_size =
-
+    hidden_size = hf_config.hidden_size
     head_dim_original = model_config.get_head_size()
 
     metadata_map.reshape_map.update({
@@ -312,7 +311,11 @@ class EagleLlama3ForCausalLM(nnx.Module):
             r".*d2t.*",
         ]
 
-
+        # `embed_tokens` is shared between target and draft.
+        exclude_regex = [r".*embed_tokens.*"]
+        metadata_map = get_default_maps(
+            self.vllm_config.speculative_config.draft_model_config, self.mesh,
+            mappings)
 
         update_reshape_map_for_eagle3(self.vllm_config, metadata_map)
 
@@ -322,7 +325,8 @@ class EagleLlama3ForCausalLM(nnx.Module):
                         metadata_map=metadata_map,
                         mesh=self.mesh,
                         is_draft_model=True,
-                        keep_original_dtype_keys_regex=keep_original_dtype_keys_regex
+                        keep_original_dtype_keys_regex=keep_original_dtype_keys_regex,
+                        exclude_regex=exclude_regex if exclude_regex else None)
 
         # If the embedding is not initialized, initialize it with a dummpy array here to pass jit compilation. The real weights will be shared from the target model in eagle3 class.
         if isinstance(self.model.embed_tokens.embedding.value,
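The new `exclude_regex=[r".*embed_tokens.*"]` keeps the draft model from loading its own copy of the embedding that is shared with the target model. A toy illustration of regex-based checkpoint-key filtering (this is not the actual `load_hf_weights` implementation):

# Toy sketch: drop checkpoint keys whose names match any exclusion pattern.
import re

exclude_regex = [r".*embed_tokens.*"]
checkpoint_keys = [
    "model.embed_tokens.weight",               # shared with target -> skipped
    "model.layers.0.self_attn.q_proj.weight",
    "lm_head.weight",
]

kept = [
    key for key in checkpoint_keys
    if not any(re.match(pattern, key) for pattern in exclude_regex)
]
print(kept)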