tpu-inference 0.11.1.dev202511150811__py3-none-any.whl → 0.11.1.dev202512030818__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (54)
  1. tests/kernels/fused_moe_v1_test.py +303 -34
  2. tests/lora/test_layers.py +0 -6
  3. tests/lora/utils.py +0 -8
  4. tests/test_envs.py +32 -11
  5. tests/test_utils.py +1 -2
  6. tpu_inference/__init__.py +22 -3
  7. tpu_inference/core/disagg_utils.py +6 -8
  8. tpu_inference/distributed/tpu_connector.py +3 -4
  9. tpu_inference/distributed/utils.py +3 -2
  10. tpu_inference/envs.py +61 -8
  11. tpu_inference/executors/ray_distributed_executor.py +31 -11
  12. tpu_inference/kernels/fused_moe/v1/kernel.py +641 -110
  13. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
  14. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +213 -126
  15. tpu_inference/layers/common/attention_interface.py +7 -1
  16. tpu_inference/layers/common/sharding.py +5 -5
  17. tpu_inference/layers/vllm/fused_moe.py +74 -25
  18. tpu_inference/layers/vllm/quantization/common.py +6 -1
  19. tpu_inference/layers/vllm/quantization/mxfp4.py +137 -62
  20. tpu_inference/layers/vllm/quantization/unquantized.py +107 -113
  21. tpu_inference/layers/vllm/sharding.py +2 -2
  22. tpu_inference/lora/torch_punica_tpu.py +1 -2
  23. tpu_inference/models/common/model_loader.py +45 -11
  24. tpu_inference/models/jax/llama3.py +2 -1
  25. tpu_inference/models/jax/llama_eagle3.py +8 -5
  26. tpu_inference/models/jax/llama_guard_4.py +361 -0
  27. tpu_inference/models/jax/qwen2.py +2 -1
  28. tpu_inference/models/jax/qwen2_5_vl.py +163 -48
  29. tpu_inference/models/jax/qwen3.py +2 -1
  30. tpu_inference/models/jax/utils/quantization/quantization_utils.py +3 -6
  31. tpu_inference/models/jax/utils/weight_utils.py +198 -143
  32. tpu_inference/models/vllm/vllm_model_wrapper.py +14 -7
  33. tpu_inference/platforms/tpu_platform.py +28 -22
  34. tpu_inference/runner/compilation_manager.py +144 -59
  35. tpu_inference/runner/kv_cache_manager.py +17 -18
  36. tpu_inference/runner/persistent_batch_manager.py +40 -2
  37. tpu_inference/runner/structured_decoding_manager.py +2 -3
  38. tpu_inference/runner/tpu_runner.py +271 -147
  39. tpu_inference/runner/utils.py +2 -2
  40. tpu_inference/spec_decode/jax/eagle3.py +71 -21
  41. tpu_inference/tpu_info.py +4 -3
  42. tpu_inference/utils.py +36 -13
  43. tpu_inference/worker/tpu_worker.py +162 -25
  44. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/METADATA +3 -2
  45. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/RECORD +48 -53
  46. tpu_inference/mock/__init__.py +0 -0
  47. tpu_inference/mock/vllm_config_utils.py +0 -28
  48. tpu_inference/mock/vllm_envs.py +0 -1219
  49. tpu_inference/mock/vllm_logger.py +0 -212
  50. tpu_inference/mock/vllm_logging_utils.py +0 -15
  51. tpu_inference/models/jax/phi3.py +0 -376
  52. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/WHEEL +0 -0
  53. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/licenses/LICENSE +0 -0
  54. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/top_level.txt +0 -0
@@ -108,6 +108,8 @@ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        assert isinstance(layer, LinearBase)
+
         with jax.named_scope(layer._get_name()):
             if in_sharding := self.jax_config.get_input_sharding(x):
                 x.shard_(NamedSharding(self.jax_config.mesh, in_sharding))
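Aside (not part of the diff): jax.named_scope, used in the context line above, is a standard JAX context manager that tags every operation traced inside it so the layer shows up under its own name in profiler traces and lowered HLO. A minimal sketch with an illustrative function name:

    import jax
    import jax.numpy as jnp

    @jax.jit
    def my_linear(x):
        # Ops traced inside the scope carry the "my_linear_matmul" prefix in
        # profiles and HLO dumps; the scope has no numerical effect.
        with jax.named_scope("my_linear_matmul"):
            return x @ x.T

    my_linear(jnp.ones((4, 4)))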
@@ -170,14 +172,14 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
         self.ep_axis_name = ep_axis_name
         # TODO: Use autotune table once we have it.
         self.block_size = {
-            "bt": 16,
-            "bf": 384,
-            "bd1": 512,
-            "bd2": 512,
-            "btc": 16,
-            "bfc": 384,
-            "bd1c": 256,
-            "bd2c": 256,
+            "bt": 64,
+            "bf": 1024,
+            "bd1": 1536,
+            "bd2": 1536,
+            "btc": 64,
+            "bfc": 1024,
+            "bd1c": 1536,
+            "bd2c": 1536,
         }
 
     def select_gemm_impl(
@@ -191,131 +193,119 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)
-        available_devices = self.mesh.devices.flatten()
-        with jax.default_device(available_devices[0]):
-            w13_weight = t2j(layer.w13_weight, use_dlpack=False)
-            w2_weight = t2j(layer.w2_weight, use_dlpack=False)
+        w13_weight = t2j(layer.w13_weight, use_dlpack=False)
+        w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
-            if self.moe.has_bias:
-                w13_bias = t2j(layer.w13_bias, use_dlpack=False)
-                w2_bias = t2j(layer.w2_bias, use_dlpack=False)
-
-            if layer.activation == "swigluoai":
-                # When using swigluoai, vLLM splits gmm output in a interleaved way.
-                # However, interleaved split is not performant on TPU. Therefore,
-                # we preprocess the weight so that splitting gmm output by middle
-                # can still get the same result.
-                w1_weight = w13_weight[:, ::2, :]
-                w3_weight = w13_weight[:, 1::2, :]
-                w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+        if self.moe.has_bias:
+            w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+            w2_bias = t2j(layer.w2_bias, use_dlpack=False)
+
+        if layer.activation == "swigluoai":
+            # When using swigluoai, vLLM splits gmm output in a interleaved way.
+            # However, interleaved split is not performant on TPU. Therefore,
+            # we preprocess the weight so that splitting gmm output by middle
+            # can still get the same result.
+            w1_weight = w13_weight[:, ::2, :]
+            w3_weight = w13_weight[:, 1::2, :]
+            w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
 
-                if self.moe.has_bias:
-                    w1_bias = w13_bias[:, ::2]
-                    w3_bias = w13_bias[:, 1::2]
-                    w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
-
-            if self.use_kernel and layer.use_ep:
-                # Kernel expects:
-                # w13: (num_experts, 2, hidden_size, intermediate_size)
-                # w2: (num_experts, intermediate_size, hidden_size)
-                # Current format:
-                # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
-                # w2_weight: (num_experts, hidden_size, intermediate_size)
-                num_experts = w13_weight.shape[0]
-                intermediate_size = w13_weight.shape[1] // 2
-                hidden_size = w13_weight.shape[2]
+            if self.moe.has_bias:
+                w1_bias = w13_bias[:, ::2]
+                w3_bias = w13_bias[:, 1::2]
+                w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-                # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
-                w13_reshaped = w13_weight.reshape(num_experts, 2,
-                                                  intermediate_size,
-                                                  hidden_size)
-                w13_weight_transposed = jnp.transpose(w13_reshaped,
-                                                      (0, 1, 3, 2))
+        if self.use_kernel and layer.use_ep:
+            # Kernel expects:
+            # w13: (num_experts, 2, hidden_size, intermediate_size)
+            # w2: (num_experts, intermediate_size, hidden_size)
+            # Current format:
+            # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+            # w2_weight: (num_experts, hidden_size, intermediate_size)
+            num_experts = w13_weight.shape[0]
+            intermediate_size = w13_weight.shape[1] // 2
+            hidden_size = w13_weight.shape[2]
+
+            # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
+            w13_reshaped = w13_weight.reshape(num_experts, 2,
+                                              intermediate_size, hidden_size)
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
+
+            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+            w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
+
+            # Apply EP sharding
+            w13_weight = jax.device_put(
+                w13_weight_transposed,
+                Format(Layout((0, 1, 2, 3)),
+                       NamedSharding(self.mesh, P("model", None, None, None))))
+            w2_weight = jax.device_put(
+                w2_weight_transposed,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
 
-                # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
-                w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
+            if self.moe.has_bias:
+                w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
 
                 # Apply EP sharding
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+
+        else:
+            # Original logic for non-kernel path
+            if layer.use_ep:
                 w13_weight = jax.device_put(
-                    w13_weight_transposed,
-                    Format(
-                        Layout((0, 1, 2, 3)),
-                        NamedSharding(self.mesh, P("model", None, None,
-                                                   None))))
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
                 w2_weight = jax.device_put(
-                    w2_weight_transposed,
+                    w2_weight,
                     Format(Layout((0, 1, 2)),
                            NamedSharding(self.mesh, P("model", None, None))))
 
                 if self.moe.has_bias:
-                    w13_bias = w13_bias.reshape(num_experts, 2,
-                                                intermediate_size)
-
-                    # Apply EP sharding
                     w13_bias = jax.device_put(
                         w13_bias,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P("model", None, None))))
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
                     w2_bias = jax.device_put(
                         w2_bias,
                         Format(Layout((0, 1)),
                                NamedSharding(self.mesh, P("model", None))))
 
             else:
-                # Original logic for non-kernel path
-                if layer.use_ep:
-                    w13_weight = jax.device_put(
-                        w13_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P("model", None, None))))
-                    w2_weight = jax.device_put(
-                        w2_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P("model", None, None))))
-
-                    if self.moe.has_bias:
-                        w13_bias = jax.device_put(
-                            w13_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P("model", None))))
-                        w2_bias = jax.device_put(
-                            w2_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P("model", None))))
-
-                else:
-                    intermediate_size = w13_weight.shape[1] // 2
-                    assert intermediate_size == w2_weight.shape[-1]
-                    output_sizes = [intermediate_size, intermediate_size]
-                    n_shards = self.mesh.shape["model"]
-                    assert intermediate_size % n_shards == 0
-                    w13_weight = reorder_concatenated_tensor_for_sharding(
-                        w13_weight, output_sizes, n_shards, dim=1)
-                    w13_weight = jax.device_put(
-                        w13_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P(None, "model", None))))
-                    w2_weight = jax.device_put(
-                        w2_weight,
-                        Format(
-                            Layout((0, 1, 2)),
-                            NamedSharding(self.mesh, P(None, None, "model"))))
-
-                    if self.moe.has_bias:
-                        w13_bias = reorder_concatenated_tensor_for_sharding(
-                            w13_bias, output_sizes, n_shards, dim=1)
-                        w13_bias = jax.device_put(
-                            w13_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P(None, "model"))))
-                        w2_bias = jax.device_put(
-                            w2_bias,
-                            Format(Layout((0, 1)),
-                                   NamedSharding(self.mesh, P(None, None))))
+                intermediate_size = w13_weight.shape[1] // 2
+                assert intermediate_size == w2_weight.shape[-1]
+                output_sizes = [intermediate_size, intermediate_size]
+                n_shards = self.mesh.shape["model"]
+                assert intermediate_size % n_shards == 0
+                w13_weight = reorder_concatenated_tensor_for_sharding(
+                    w13_weight, output_sizes, n_shards, dim=1)
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, "model", None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, None, "model"))))
+
+                if self.moe.has_bias:
+                    w13_bias = reorder_concatenated_tensor_for_sharding(
+                        w13_bias, output_sizes, n_shards, dim=1)
+                    w13_bias = jax.device_put(
+                        w13_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P(None, "model"))))
+                    w2_bias = jax.device_put(
+                        w2_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P(None, None))))
 
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
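The swigluoai branch in the hunk above de-interleaves w13 once at load time so the kernel can later split the gmm output down the middle instead of with a strided gather. A toy JAX sketch (illustrative only, made-up tiny shapes, not code from the package) of why the contiguous split then matches vLLM's interleaved split:

    import jax.numpy as jnp

    num_experts, intermediate_size, hidden_size = 2, 4, 8
    w13 = jnp.arange(num_experts * 2 * intermediate_size * hidden_size,
                     dtype=jnp.float32).reshape(num_experts,
                                                2 * intermediate_size,
                                                hidden_size)

    # vLLM's swigluoai convention: w1 and w3 rows are interleaved along dim 1.
    w1_interleaved = w13[:, ::2, :]
    w3_interleaved = w13[:, 1::2, :]

    # The preprocessing from the diff: de-interleave once when loading...
    w13_reordered = jnp.concat([w1_interleaved, w3_interleaved], axis=1)

    # ...so a cheap contiguous "split by middle" now yields the same halves.
    w1_middle, w3_middle = jnp.split(w13_reordered, 2, axis=1)
    assert jnp.array_equal(w1_middle, w1_interleaved)
    assert jnp.array_equal(w3_middle, w3_interleaved)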
@@ -360,9 +350,13 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
                 tokens=jax_view(x),
                 w1=jax_view(layer.w13_weight),
                 w2=jax_view(layer.w2_weight),
+                b1=jax_view(layer.w13_bias) if self.moe.has_bias else None,
+                b2=jax_view(layer.w2_bias) if self.moe.has_bias else None,
                 gating_output=jax_view(router_logits),
                 top_k=top_k,
                 ep_axis_name=self.ep_axis_name,
+                renormalize_topk_logits=renormalize,
+                act_fn=activation,
                 **self.block_size,
             )
         else:
@@ -19,6 +19,7 @@ from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -211,8 +212,7 @@ def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
 def _sharded_device_put(tensor: jax.Array, sharding) -> jax.Array:
     if isinstance(tensor, tuple):
         return tuple(_sharded_device_put(t, sharding) for t in tensor)
-    import os
-    multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
+    multihost_backend = envs.TPU_MULTIHOST_BACKEND
     if multihost_backend != "ray":
         return jax.device_put(tensor, sharding)
 
@@ -239,7 +239,6 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         lora_index_to_id: list[Optional[int]],
         max_loras: int,
         vocab_size: int,
-        extra_vocab_size: int,
     ):
         # Pad the prompt mapping to avoid running into recompiles on the TPU
         # TODO: Should this happen inside mapping internally? If so how can we
@@ -258,7 +257,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             lora_index_to_id,
             max_loras,
             vocab_size,
-            extra_vocab_size,
+            0, # extra_vocab_size
             "cpu",
         )
         with torchax.default_env():
@@ -8,6 +8,9 @@ from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from torchax.ops.mappings import j2t_dtype
 from transformers import PretrainedConfig
 from vllm.config import VllmConfig
+from vllm.model_executor.model_loader import get_model_loader
+from vllm.model_executor.model_loader.runai_streamer_loader import \
+    RunaiModelStreamerLoader
 from vllm.utils.func_utils import supports_kw
 
 from tpu_inference import envs
@@ -36,19 +39,17 @@ def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
     from tpu_inference.models.jax.llama3 import LlamaForCausalLM
     from tpu_inference.models.jax.llama4 import Llama4ForCausalLM
     from tpu_inference.models.jax.llama_eagle3 import EagleLlama3ForCausalLM
-    from tpu_inference.models.jax.phi3 import Phi3ForCausalLM
-    from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
+    from tpu_inference.models.jax.llama_guard_4 import LlamaGuard4ForCausalLM
     from tpu_inference.models.jax.qwen2_5_vl import \
         Qwen2_5_VLForConditionalGeneration
     from tpu_inference.models.jax.qwen3 import Qwen3ForCausalLM
     _MODEL_REGISTRY["Llama4ForCausalLM"] = Llama4ForCausalLM
     _MODEL_REGISTRY["DeepseekV3ForCausalLM"] = DeepSeekV3
     _MODEL_REGISTRY["LlamaForCausalLM"] = LlamaForCausalLM
-    _MODEL_REGISTRY["Qwen2ForCausalLM"] = Qwen2ForCausalLM
+    _MODEL_REGISTRY["Llama4ForConditionalGeneration"] = LlamaGuard4ForCausalLM
     _MODEL_REGISTRY["Qwen3ForCausalLM"] = Qwen3ForCausalLM
     _MODEL_REGISTRY[
         "Qwen2_5_VLForConditionalGeneration"] = Qwen2_5_VLForConditionalGeneration
-    _MODEL_REGISTRY["Phi3ForCausalLM"] = Phi3ForCausalLM
     _MODEL_REGISTRY["Eagle3LlamaForCausalLM"] = EagleLlama3ForCausalLM
     _MODEL_REGISTRY["GptOssForCausalLM"] = GptOss
 
@@ -57,8 +58,10 @@ def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
     if arch in _MODEL_REGISTRY:
         return _MODEL_REGISTRY[arch]
     raise UnsupportedArchitectureError(
-        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}")
+        f"Model architectures {architectures} not "
+        "registered in tpu-inference. Falling back to vLLM-native "
+        f"Pytorch definition. JAX-native architectures: {list(_MODEL_REGISTRY.keys())}"
+    )
 
 
 def _get_nnx_model(
@@ -177,7 +180,23 @@ def _get_nnx_model(
     # the model creation again, otherwise the model forward will have
     # non-trivial overhead in PjitFunction.
     with mesh:
-        model.load_weights(rng)
+        loader = get_model_loader(vllm_config.load_config)
+        if isinstance(loader, RunaiModelStreamerLoader):
+            model_weights = vllm_config.model_config.model
+            if hasattr(vllm_config.model_config, "model_weights"):
+                model_weights = vllm_config.model_config.model_weights
+            weights_iterator = loader._get_weights_iterator(
+                model_weights, vllm_config.model_config.revision)
+            # We set the weights iterator at runtime, to prevent having to change
+            # every model's load_weights signature. This also prevents us from hitting
+            # a TypeError at runtime if you use the RunaiModelStreamerLoader with any
+            # flax_nnx model whose load_weights function does not accept the
+            # weights_iterator keyword argument.
+            vllm_config.model_config.model_weights_iterator = weights_iterator
+            model.load_weights(rng)
+            del vllm_config.model_config.model_weights_iterator
+        else:
+            model.load_weights(rng)
     jit_model = create_jit_model(
         model,
         use_qwix_on_abstract_model=should_apply_qwix_on_abstract_model)
@@ -217,7 +236,9 @@ def get_flax_model(
             hidden_states_sharding, # aux hidden states
         ),
         donate_argnums=2, # 0 is graphdef, 1 is state, 2 is kv_cache
-        static_argnums=6, #6 is layer_name_to_kvcache_index
+        static_argnums=(
+            7, 10, 11
+        ), #7 is layer_name_to_kvcache_index, 10 is is_first_rank, 11 is is_last_rank
     )
     def run_model(graphdef, state, *args):
         model = nnx.merge(graphdef, state)
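For context on the static_argnums change above: arguments listed there are treated as compile-time constants, so flags like is_first_rank and is_last_rank can drive ordinary Python control flow inside the jitted function, at the cost of one retrace per distinct value. A standalone sketch (illustrative names, not code from the package):

    import functools
    import jax
    import jax.numpy as jnp

    @functools.partial(jax.jit, static_argnums=(1, 2))
    def run_stage(x, is_first_rank, is_last_rank):
        # Static args are plain Python values at trace time, so `if` works;
        # jit compiles one program per distinct (bool, bool) combination.
        if is_first_rank and not is_last_rank:
            return x * 2.0
        return x + 1.0

    x = jnp.ones(4)
    run_stage(x, True, False)   # compiles for (True, False)
    run_stage(x, False, True)   # retraces for the new static combination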
@@ -242,10 +263,11 @@ def get_flax_model(
         model = nnx.merge(graphdef, state)
         return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
 
+    embed_sharding = NamedSharding(mesh, PartitionSpec(None))
     # This function will calculates the embeddings of input texts and then merge with the image embeddings
     @functools.partial(
         jax.jit,
-        out_shardings=(logits_sharding),
+        out_shardings=(embed_sharding),
     )
     def run_get_input_embeddings(graphdef, state, *args, **kwargs):
         model = nnx.merge(graphdef, state)
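Context for the out_shardings change above: PartitionSpec(None) leaves the corresponding dimension unsharded, so NamedSharding(mesh, PartitionSpec(None)) replicates the embedding output across the mesh rather than splitting it the way a logits sharding over a "model" axis would. A small self-contained sketch (illustrative, not from the package):

    import numpy as np
    import jax
    import jax.numpy as jnp
    from jax.sharding import Mesh, NamedSharding, PartitionSpec

    mesh = Mesh(np.array(jax.devices()), axis_names=("model",))
    replicated = NamedSharding(mesh, PartitionSpec(None))      # same construction as embed_sharding above
    row_sharded = NamedSharding(mesh, PartitionSpec("model"))  # split dim 0 over the "model" axis

    x = jnp.ones((2 * mesh.size, 4))
    x_rep = jax.device_put(x, replicated)     # every device holds the full array
    x_split = jax.device_put(x, row_sharded)  # each device holds a slice of dim 0
    print(x_rep.sharding, x_split.sharding)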
@@ -325,8 +347,8 @@ def get_model(
         # Convert the error message to a string to check its contents
         error_msg = str(e)
 
-        logger.warning(f"Flax model failed with: '{error_msg}'. "
-                       "Falling back to vLLM implementation.")
+        logger.warning(error_msg)
+
         # Fall back to the vLLM model and updating the dtype accordingly
         vllm_config.model_config.dtype = j2t_dtype(
             vllm_config.model_config.dtype.dtype)
@@ -420,6 +442,17 @@ def register_model(arch: str, model: Any) -> None:
             "This is a JAX model and does not implement the PyTorch forward method."
         )
 
+    # Same as `forward`, this is a dummy method to satisfy vLLM's type checks.
+    def unimplemented_get_input_embeddings(
+        self,
+        input_ids: "torch.Tensor",
+        positions: "torch.Tensor",
+        inputs_embeds: Optional["torch.Tensor"] = None,
+    ) -> "torch.Tensor":
+        raise NotImplementedError(
+            "This is a JAX model and does not implement the PyTorch get_input_embeddings method."
+        )
+
     # We need a custom __init__ that only calls torch.nn.Module's init,
     # to avoid triggering JAX logic when vLLM inspects the class.
     def wrapper_init(self, *args, **kwargs):
@@ -433,6 +466,7 @@ def register_model(arch: str, model: Any) -> None:
         {
             "__init__": wrapper_init,
             "forward": unimplemented_forward,
+            "get_input_embeddings": unimplemented_get_input_embeddings,
             # Prevent vLLM from trying to load weights into this dummy class.
             "load_weights": lambda self, *args, **kwargs: None,
         })
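The two hunks above extend a placeholder class that is assembled from a dict of stub methods. A generic sketch of that pattern (hypothetical names, not the package's actual register_model code):

    import torch

    def make_placeholder(name: str) -> type:
        # __init__ only runs torch.nn.Module's setup, so the class can be
        # inspected and instantiated without touching any JAX machinery.
        def wrapper_init(self, *args, **kwargs):
            torch.nn.Module.__init__(self)

        def unimplemented(self, *args, **kwargs):
            raise NotImplementedError(f"{name} is JAX-backed; PyTorch hooks are stubs.")

        return type(name, (torch.nn.Module,), {
            "__init__": wrapper_init,
            "forward": unimplemented,
            "get_input_embeddings": unimplemented,
            # Weight loading happens on the JAX side, so this is a no-op here.
            "load_weights": lambda self, *args, **kwargs: None,
        })

    Dummy = make_placeholder("DummyJaxModel")
    model = Dummy()       # constructs cleanly
    model.load_weights()  # no-op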
@@ -368,7 +368,8 @@ class LlamaForCausalLM(nnx.Module):
             "lm_head": "model.lm_head",
         })
 
-        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
+        metadata_map = get_default_maps(self.vllm_config.model_config,
+                                        self.mesh, mappings)
         load_hf_weights(vllm_config=self.vllm_config,
                         model=self,
                         metadata_map=metadata_map,
@@ -194,13 +194,12 @@ class Eagle3LlamaModel(nnx.Module):
 
 def update_reshape_map_for_eagle3(vllm_config: VllmConfig,
                                   metadata_map: MetadataMap):
-    model_config = vllm_config.model_config
+    model_config = vllm_config.speculative_config.draft_model_config
     hf_config = model_config.hf_config
 
     num_heads = hf_config.num_attention_heads
     num_kv_heads = hf_config.num_key_value_heads
-    hidden_size = model_config.get_hidden_size()
-
+    hidden_size = hf_config.hidden_size
     head_dim_original = model_config.get_head_size()
 
     metadata_map.reshape_map.update({
@@ -305,6 +304,8 @@ class EagleLlama3ForCausalLM(nnx.Module):
             "fc": "model.fc.kernel",
             "lm_head": "lm_head.kernel",
             "d2t": "draft_id_to_target_id",
+            "embed_tokens":
+            "model.embed_tokens.embedding", # Some checkpoints need this
         }
 
         # Define keys to keep in original dtype (e.g., float32 for stability)
@@ -312,7 +313,9 @@ class EagleLlama3ForCausalLM(nnx.Module):
             r".*d2t.*",
         ]
 
-        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
+        metadata_map = get_default_maps(
+            self.vllm_config.speculative_config.draft_model_config, self.mesh,
+            mappings)
 
         update_reshape_map_for_eagle3(self.vllm_config, metadata_map)
 
@@ -324,7 +327,7 @@ class EagleLlama3ForCausalLM(nnx.Module):
             is_draft_model=True,
             keep_original_dtype_keys_regex=keep_original_dtype_keys_regex)
 
-        # If the embedding is not initialized, initialize it with a dummpy array here to pass jit compilation. The real weights will be shared from the target model in eagle3 class.
+        # If the embedding is not initialized, initialize it with a dummy array here to pass jit compilation. The real weights will be shared from the target model in eagle3 class.
         if isinstance(self.model.embed_tokens.embedding.value,
                       jax.ShapeDtypeStruct):
             self.model.embed_tokens.embedding.value = jnp.zeros(