tpu-inference 0.11.1.dev202511130813__py3-none-any.whl → 0.11.1.dev202511180814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/test_envs.py +182 -0
- tests/test_utils.py +23 -14
- tpu_inference/core/core_tpu.py +17 -9
- tpu_inference/executors/ray_distributed_executor.py +24 -11
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +33 -10
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +7 -0
- tpu_inference/layers/{jax → common}/attention_interface.py +1 -1
- tpu_inference/layers/common/quant_methods.py +8 -0
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/quantization/__init__.py +7 -3
- tpu_inference/layers/vllm/quantization/awq.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -2
- tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
- tpu_inference/layers/vllm/quantization/unquantized.py +4 -3
- tpu_inference/models/common/model_loader.py +3 -2
- tpu_inference/models/jax/llama3.py +2 -2
- tpu_inference/models/jax/phi3.py +1 -1
- tpu_inference/models/jax/qwen2.py +1 -1
- tpu_inference/models/jax/qwen2_5_vl.py +2 -2
- tpu_inference/models/jax/qwen3.py +1 -1
- tpu_inference/models/vllm/vllm_model_wrapper.py +22 -10
- tpu_inference/platforms/tpu_platform.py +12 -5
- tpu_inference/runner/compilation_manager.py +4 -2
- tpu_inference/runner/kv_cache.py +1 -1
- tpu_inference/runner/tpu_runner.py +31 -7
- tpu_inference/utils.py +2 -2
- tpu_inference/worker/tpu_worker.py +1 -1
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/METADATA +1 -1
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/RECORD +37 -34
- /tpu_inference/layers/{jax → common}/binary_search.py +0 -0
- /tpu_inference/layers/{jax → common}/sharding.py +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/mxfp4.py
ADDED
@@ -0,0 +1,266 @@
+from typing import Callable, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import torch
+from jax.experimental.layout import Format, Layout
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torch.nn.parameter import Parameter
+from torchax.interop import jax_view, torch_view
+from torchax.ops.mappings import t2j
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig, FusedMoEQuantConfig, biased_moe_quant_config)
+from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
+                                                        FusedMoEMethodBase)
+from vllm.model_executor.layers.linear import LinearBase
+from vllm.model_executor.layers.quantization import \
+    register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizeMethodBase
+from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
+                                                           Mxfp4Config,
+                                                           Mxfp4MoEMethod)
+from vllm.model_executor.layers.quantization.utils.quant_utils import \
+    is_layer_skipped
+
+from tpu_inference.layers.common.quant_methods import (MXFP4,
+                                                       get_tpu_quant_method)
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
+from tpu_inference.layers.vllm.linear_common import \
+    reorder_concatenated_tensor_for_sharding
+from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedLinearMethod
+
+MXFP4_BLOCK_SIZE = 32
+
+P = PartitionSpec
+logger = init_logger(__name__)
+
+
+# TODO(kyuyeunk): Move these functions into a common utility file.
+def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:
+    assert u8_packed_e2m1.dtype == jnp.uint8
+    e2m1 = jax.lax.bitcast_convert_type(u8_packed_e2m1, jnp.float4_e2m1fn)
+    # bitcast creates one more dimension that splits 8 bits into two e2m1.
+    # we flatten them with the last dim.
+    return jnp.reshape(e2m1, e2m1.shape[:-2] + (-1, ))
+
+
+def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
+    e8_finfo = jnp.finfo(jnp.float8_e8m0fnu)
+    exponents = u8.astype(jnp.int32) + e8_finfo.minexp
+    ones = jnp.ones_like(u8, dtype=jnp.float32)
+    return jnp.ldexp(ones, exponents)
+
+
+def dequantize_block_weight(weight: jax.Array,
+                            scale: jax.Array,
+                            block_size: int,
+                            out_dtype: jnp.dtype = jnp.bfloat16) -> jax.Array:
+    orig_shape = weight.shape
+    weight_block = weight.reshape(orig_shape[:-1] + (-1, block_size))
+    weight_dequantized = weight_block.astype(jnp.float32) * jnp.expand_dims(
+        scale, -1)
+    return weight_dequantized.reshape(orig_shape).astype(out_dtype)
+
+
+@register_quantization_config(get_tpu_quant_method(MXFP4))
+class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
+
+    @classmethod
+    def get_name(cls):
+        return MXFP4
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention  # Avoid circular import
+
+        if isinstance(layer, LinearBase):
+            linear_config = self.get_linear_config(layer)
+            if self.ignored_layers and is_layer_skipped(
+                    prefix=prefix,
+                    ignored_layers=self.ignored_layers,
+                    fused_mapping=self.packed_modules_mapping,
+            ):
+                return VllmUnquantizedLinearMethod(linear_config)
+            # TODO: Add support for MXFP4 Linear Method.
+            # MXFP4 LinearMethod is available in AMD-Quark, refer to that
+            # implementation if you are interested in enabling MXFP4 here.
+            logger.warning_once(
+                "MXFP4 linear layer is not implemented - falling back to "
+                "UnquantizedLinearMethod.")
+            return VllmUnquantizedLinearMethod(linear_config)
+        elif isinstance(layer, FusedMoE):
+            return VllmMxfp4MoEMethod(layer.moe_config, self.mesh)
+        elif isinstance(layer, Attention):
+            # TODO: Add support for MXFP4 Attention.
+            logger.warning_once("MXFP4 attention layer is not implemented. "
+                                "Skipping quantization for this layer.")
+        return None
+
+
+class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
+
+    def __init__(self, moe: FusedMoEConfig, mesh: Mesh):
+        FusedMoEMethodBase.__init__(self, moe)
+
+        # We piggyback on triton implementation as it applies minimal hardware
+        # specific post processing to the weights.
+        self.mxfp4_backend = Mxfp4Backend.TRITON
+        self.mesh = mesh
+
+    def get_fused_moe_quant_config(
+            self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
+        # Because we have dequantized weights, we only need biased moe config.
+        # TODO(kyuyeunk): Add native support for MXFP4.
+        return biased_moe_quant_config(
+            layer.w13_bias,
+            layer.w2_bias,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        assert isinstance(layer, FusedMoE)
+
+        w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
+        w13_weight_scale = e8m0_to_fp32(
+            t2j(layer.w13_weight_scale, use_dlpack=False))
+        w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+
+        w2_weight = u8_unpack_e2m1(t2j(layer.w2_weight, use_dlpack=False))
+        w2_weight_scale = e8m0_to_fp32(
+            t2j(layer.w2_weight_scale, use_dlpack=False))
+        w2_bias = t2j(layer.w2_bias, use_dlpack=False)
+
+        # We dequantize fp4 weights into bf16.
+        # TODO(kyuyeunk): Add native support for MXFP4.
+        w13_weight = dequantize_block_weight(w13_weight, w13_weight_scale,
+                                             MXFP4_BLOCK_SIZE, jnp.bfloat16)
+        w2_weight = dequantize_block_weight(w2_weight, w2_weight_scale,
+                                            MXFP4_BLOCK_SIZE, jnp.bfloat16)
+
+        # Because we have dequantized weights, scales are not used anymore.
+        delattr(layer, "w13_weight_scale")
+        delattr(layer, "w2_weight_scale")
+
+        if layer.activation == "swigluoai":
+            # When using swigluoai, vLLM splits gmm output in a interleaved way.
+            # However, interleaved split is not performant on TPU. Therefore,
+            # we preprocess the weight so that splitting gmm output by middle
+            # can still get the same result.
+            w1_weight = w13_weight[:, ::2, :]
+            w3_weight = w13_weight[:, 1::2, :]
+            w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+
+            w1_bias = w13_bias[:, ::2]
+            w3_bias = w13_bias[:, 1::2]
+            w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
+
+        # TODO(kyuyeunk): Add weight processing logic for the new kernel.
+        if layer.use_ep:
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+
+        else:
+            intermediate_size = w13_weight.shape[1] // 2
+            assert intermediate_size == w2_weight.shape[-1]
+            output_sizes = [intermediate_size, intermediate_size]
+            n_shards = self.mesh.shape["model"]
+            assert intermediate_size % n_shards == 0
+            w13_weight = reorder_concatenated_tensor_for_sharding(w13_weight,
+                                                                  output_sizes,
+                                                                  n_shards,
+                                                                  dim=1)
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, "model", None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, None, "model"))))
+
+            w13_bias = reorder_concatenated_tensor_for_sharding(w13_bias,
+                                                                output_sizes,
+                                                                n_shards,
+                                                                dim=1)
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P(None, "model"))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)), NamedSharding(self.mesh, P(None,
+                                                                  None))))
+
+        layer.w13_weight = Parameter(torch_view(w13_weight),
+                                     requires_grad=False)
+        layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
+
+        layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
+        layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
+
+        pass
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        assert isinstance(layer, FusedMoE)
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax is supported for scoring_func")
+
+        # Use the original implementation
+        output = fused_moe_func_padded(
+            jax_view(x),
+            jax_view(layer.w13_weight),
+            jax_view(layer.w2_weight),
+            jax_view(layer.w13_bias) if self.moe.has_bias else None,
+            jax_view(layer.w2_bias) if self.moe.has_bias else None,
+            jax_view(router_logits),
+            topk=top_k,
+            global_num_experts=global_num_experts,
+            renormalize=renormalize,
+            reduce_results=layer.reduce_results,
+            mesh=self.mesh,
+            use_ep=layer.use_ep,
+            activation=activation,
+        )
+
+        return torch_view(output)
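Note on the helpers above: MXFP4 stores weights as packed 4-bit e2m1 values in groups of 32, with one shared e8m0 scale per group that decodes to a power of two (2 ** (stored_byte - 127)); dequantization multiplies each block by its decoded scale. A minimal, self-contained sketch of that block arithmetic on toy float32 data (the tensors and values below are illustrative only, not part of the package):

import jax.numpy as jnp

MXFP4_BLOCK_SIZE = 32

def dequantize_block_sketch(weight, scale, block_size, out_dtype=jnp.bfloat16):
    # Split the last dim into (num_blocks, block_size) and scale each block,
    # mirroring dequantize_block_weight() in the file above.
    orig_shape = weight.shape
    blocks = weight.reshape(orig_shape[:-1] + (-1, block_size))
    dequantized = blocks.astype(jnp.float32) * jnp.expand_dims(scale, -1)
    return dequantized.reshape(orig_shape).astype(out_dtype)

# Toy data: 2 rows of 64 values -> 2 blocks of 32 per row.
weight = jnp.ones((2, 64), dtype=jnp.float32)  # stands in for unpacked e2m1 values
stored_e8m0 = jnp.array([[127, 128], [129, 130]], dtype=jnp.int32)
scale = jnp.ldexp(jnp.ones_like(stored_e8m0, dtype=jnp.float32),
                  stored_e8m0 - 127)  # decoded scales: 1, 2, 4, 8
print(dequantize_block_sketch(weight, scale, MXFP4_BLOCK_SIZE)[:, ::32])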
tpu_inference/layers/vllm/quantization/unquantized.py
CHANGED
@@ -23,6 +23,8 @@ from vllm.model_executor.layers.quantization.base_config import (
 
 from tpu_inference import envs
 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
+from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
+                                                        get_tpu_quant_method)
 from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
@@ -34,12 +36,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config(
+@register_quantization_config(get_tpu_quant_method(UNQUANTIZED))
 class VllmUnquantizedConfig(QuantizationConfig, JaxCommonConfig):
 
     @classmethod
     def get_name(cls) -> str:
-        return
+        return UNQUANTIZED
 
     @classmethod
     def get_supported_act_dtypes(cls) -> list[torch.dtype]:
@@ -189,7 +191,6 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)
-
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
tpu_inference/models/common/model_loader.py
CHANGED
@@ -11,7 +11,7 @@ from vllm.config import VllmConfig
 from vllm.utils.func_utils import supports_kw
 
 from tpu_inference import envs
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.quantization.quantization_utils import (
     apply_qwix_on_abstract_model, apply_qwix_quantization,
@@ -242,10 +242,11 @@ def get_flax_model(
         model = nnx.merge(graphdef, state)
         return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
 
+    embed_sharding = NamedSharding(mesh, PartitionSpec(None))
     # This function will calculates the embeddings of input texts and then merge with the image embeddings
     @functools.partial(
         jax.jit,
-        out_shardings=(
+        out_shardings=(embed_sharding),
     )
     def run_get_input_embeddings(graphdef, state, *args, **kwargs):
         model = nnx.merge(graphdef, state)
tpu_inference/models/jax/llama3.py
CHANGED
@@ -8,10 +8,10 @@ from transformers import LlamaConfig, modeling_flax_utils
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.rope_interface import apply_rope
-from tpu_inference.layers.jax.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
                                                          load_hf_weights)
tpu_inference/models/jax/phi3.py
CHANGED
@@ -8,8 +8,8 @@ from transformers import Phi3Config, modeling_flax_utils
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.layers.jax.rope_interface import apply_longrope, apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (MetadataMap,
tpu_inference/models/jax/qwen2.py
CHANGED
@@ -8,8 +8,8 @@ from transformers import Qwen2Config, modeling_flax_utils
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
tpu_inference/models/jax/qwen2_5_vl.py
CHANGED
@@ -14,9 +14,9 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
 from vllm.config import VllmConfig
 
 from tpu_inference import utils as utils
-from tpu_inference.layers.common.
-from tpu_inference.layers.jax.attention_interface import \
+from tpu_inference.layers.common.attention_interface import \
     sharded_flash_attention
+from tpu_inference.layers.common.attention_metadata import AttentionMetadata
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
 # from vllm.model_executor.models.interfaces import MultiModalEmbeddings
tpu_inference/models/jax/qwen3.py
CHANGED
@@ -8,8 +8,8 @@ from transformers import Qwen3Config
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.qwen2 import Qwen2DecoderLayer
tpu_inference/models/vllm/vllm_model_wrapper.py
CHANGED
@@ -25,6 +25,8 @@ from tpu_inference.layers.common.attention_metadata import AttentionMetadata
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
 from tpu_inference.layers.vllm.sharding import shard_model_to_tpu
 from tpu_inference.logger import init_logger
+from tpu_inference.models.jax.jax_intermediate_tensor import \
+    JaxIntermediateTensors
 from tpu_inference.models.vllm.vllm_model_wrapper_context import (
     get_vllm_model_wrapper_context, set_vllm_model_wrapper_context)
 from tpu_inference.runner.lora_utils import replace_lora_metadata
@@ -89,13 +91,14 @@ class VllmModelWrapper:
         slice_config = self.vllm_config.device_config.slice
         modified_slice_config = True
         self.vllm_config.device_config.slice = None
+        self.vllm_config.compilation_config.static_forward_context.clear()
+
         vllm_config_for_load = copy.deepcopy(self.vllm_config)
         if modified_slice_config:
             self.vllm_config.device_config.slice = slice_config
         assert self.vllm_config.model_config.dtype in TORCH_DTYPE_TO_JAX, "The model_config.dtype must be a PyTorch dtype."
         vllm_config_for_load.device_config.device = "cpu"
         # Clearing the cached compilation config, otherwise vllm model init will fail
-        vllm_config_for_load.compilation_config.static_forward_context.clear()
 
         # When expert parallelism is enabled, vLLM loads weight in sharding
         # aware manner. Since tpu-inference has its own sharding logic, this
@@ -117,7 +120,8 @@ class VllmModelWrapper:
 
         # Load the vLLM model and wrap it into a new model whose forward
         # function can calculate the hidden_state and logits.
-
+        available_devices = self.mesh.devices.flatten()
+        with load_context, jax.default_device(available_devices[0]):
             vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
             lora_manager = None
             if vllm_config_for_load.lora_config is not None:
@@ -149,7 +153,8 @@ class VllmModelWrapper:
                 "xla_tpu_reduce_scatter_collective_matmul_mode":
                 "post_spmd_conservative"
             },
-            static_argnames=("layer_name_to_kvcache_index",
+            static_argnames=("layer_name_to_kvcache_index", "is_first_rank",
+                             "is_last_rank"),
         )
         def step_fun(
             params_and_buffers,  # This has been wrapped into torchax TorchValue
@@ -159,6 +164,9 @@ class VllmModelWrapper:
             input_embeds: jax.Array,
             layer_name_to_kvcache_index: Sequence[Tuple[str, int]],
             lora_metadata,
+            intermediate_tensors: JaxIntermediateTensors = None,
+            is_first_rank: bool = True,
+            is_last_rank: bool = True,
             *args,
         ) -> Tuple[List[jax.Array], jax.Array]:
             layer_name_to_kvcache_index = dict(layer_name_to_kvcache_index)
@@ -173,13 +181,15 @@ class VllmModelWrapper:
             # torch_view in order to call the Torch function.
             original_lora_metadata = replace_lora_metadata(
                 self.model, lora_metadata, self.vllm_config.lora_config)
-
+            if not is_first_rank:
+                intermediate_tensors = intermediate_tensors.to_torch()
+            output_from_torch = torch.func.functional_call(
                 self.model,
                 torch_view(params_and_buffers),
                 kwargs={
                     "input_ids": torch_view(input_ids),
                     "positions": torch_view(attn_metadata.input_positions),
-                    "intermediate_tensors":
+                    "intermediate_tensors": intermediate_tensors,
                     "inputs_embeds": None,
                 },
                 tie_weights=False,
@@ -188,11 +198,13 @@ class VllmModelWrapper:
                 self.vllm_config.lora_config)
             vllm_model_wrapper_context = get_vllm_model_wrapper_context()
             new_kv_caches = vllm_model_wrapper_context.kv_caches
-            # Wrap the
-            # code to consume.
-
-
-
+            # Wrap the output(hidden states or intermediate tensor)
+            # from torch land into a JaxValue for the jax code to consume.
+            if not is_last_rank:
+                output = JaxIntermediateTensors.from_torch(output_from_torch)
+            else:
+                output = jax_view(output_from_torch)
+            return new_kv_caches, output, []
 
         return step_fun
 
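Note on the vllm_model_wrapper.py changes above: step_fun now threads pipeline-parallel state through the jitted step. Non-first ranks receive an intermediate-tensors container from the previous rank and convert it to the torch side before the forward call; non-last ranks convert the torch output back into the JAX-side container for the next rank. A stand-alone sketch of that branching with a stub container (JaxIntermediateTensors' real interface is not shown in this diff, so the stub below is an assumption for illustration only):

from dataclasses import dataclass
from typing import Any, Dict

@dataclass
class IntermediateTensorsStub:
    # Hypothetical stand-in for JaxIntermediateTensors.
    tensors: Dict[str, Any]

    def to_torch(self) -> "IntermediateTensorsStub":
        return self  # the real class would wrap its arrays with torch_view()

    @classmethod
    def from_torch(cls, torch_out: Any) -> "IntermediateTensorsStub":
        return cls(tensors={"hidden_states": torch_out})  # real class wraps with jax_view()

def run_rank(model_fn, inputs, intermediate, is_first_rank: bool, is_last_rank: bool):
    # Non-first ranks consume the previous rank's intermediate tensors.
    if not is_first_rank:
        inputs = intermediate.to_torch()
    out = model_fn(inputs)
    # Non-last ranks hand their output to the next rank; the last rank
    # returns hidden states for logits/sampling.
    return out if is_last_rank else IntermediateTensorsStub.from_torch(out)

print(run_rank(lambda x: x * 2, 3, None, True, False).tensors)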
tpu_inference/platforms/tpu_platform.py
CHANGED
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-from typing import TYPE_CHECKING, Optional, Tuple, Union, cast
+from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 
 import jax.numpy as jnp
 import vllm.envs as vllm_envs
@@ -12,7 +12,7 @@ from vllm.platforms.interface import Platform, PlatformEnum
 from vllm.sampling_params import SamplingParams, SamplingType
 
 from tpu_inference import envs
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
 
 if TYPE_CHECKING:
@@ -57,7 +57,8 @@ class TpuPlatform(Platform):
     def get_attn_backend_cls(cls, selected_backend: "_Backend", head_size: int,
                              dtype: jnp.dtype, kv_cache_dtype: Optional[str],
                              block_size: int, use_v1: bool, use_mla: bool,
-                             has_sink: bool, use_sparse: bool
+                             has_sink: bool, use_sparse: bool,
+                             attn_type: Any) -> str:
         from vllm.attention.backends.registry import _Backend
         if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
@@ -184,8 +185,14 @@
 
         multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
         if not multihost_backend:  # Single host
-
-
+            if parallel_config.pipeline_parallel_size == 1:
+                logger.info("Force using UniProcExecutor for JAX on \
+                    single host without pipeline parallelism.")
+                parallel_config.distributed_executor_backend = "uni"
+            else:
+                logger.info("Force using MultiprocExecutor for JAX on \
+                    single host with pipeline parallelism.")
+                parallel_config.distributed_executor_backend = "mp"
         elif multihost_backend == "ray":
             from tpu_inference.executors.ray_distributed_executor import \
                 RayDistributedExecutor
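Note on the tpu_platform.py change above: executor selection on a single host now depends on pipeline parallelism, while the multihost "ray" backend still routes to the Ray distributed executor. Reduced to a pure function, the decision looks roughly like this (a sketch, not the package API; behavior for other multihost backends is not shown in this diff):

def pick_executor_backend(multihost_backend: str, pipeline_parallel_size: int) -> str:
    # Mirrors the branching added above, isolated from the config objects.
    if not multihost_backend:  # single host
        return "uni" if pipeline_parallel_size == 1 else "mp"
    if multihost_backend == "ray":
        return "ray"  # resolved to RayDistributedExecutor by the caller
    raise NotImplementedError("other multihost backends not covered in this diff")

assert pick_executor_backend("", 1) == "uni"
assert pick_executor_backend("", 2) == "mp"
assert pick_executor_backend("ray", 4) == "ray"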
tpu_inference/runner/compilation_manager.py
CHANGED
@@ -10,10 +10,10 @@ from jax.sharding import NamedSharding, PartitionSpec
 
 from tpu_inference.core.disagg_utils import is_disagg_enabled
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.sample.sampling import sample
 from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata
-from tpu_inference.layers.jax.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.utils import device_array
 
@@ -332,13 +332,15 @@ class CompilationManager:
         index_paddings = self.runner.num_reqs_paddings
         dp_sharding = NamedSharding(self.runner.mesh,
                                     PartitionSpec(ShardingAxisName.ATTN_DATA))
+        hidden_states_sharding = NamedSharding(
+            self.runner.mesh, PartitionSpec(ShardingAxisName.ATTN_DATA, None))
         dp_size = self.runner.vllm_config.sharding_config.total_dp_size
         self._precompile_select_from_array_helper(
             name="select all logits",
             source_paddings=self.runner.num_tokens_paddings,
             indices_paddings=index_paddings,
             hidden_dim=hsize,
-            input_sharding=
+            input_sharding=hidden_states_sharding,
             indices_sharding=dp_sharding if dp_size > 1 else None,
         )
 
tpu_inference/runner/kv_cache.py
CHANGED
@@ -9,7 +9,7 @@ from torchax.ops.mappings import t2j_dtype
 
 import tpu_inference.kernels.ragged_paged_attention.v3.kernel as rpa
 import tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 as rpa_hd64
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 
 logger = init_logger(__name__)
tpu_inference/runner/tpu_runner.py
CHANGED
@@ -27,7 +27,7 @@ from vllm.v1.core.sched.output import GrammarOutput
 from vllm.v1.core.sched.output import SchedulerOutput as VllmSchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, KVConnectorOutput,
+                             DraftTokenIds, KVConnectorOutput, LogprobsLists,
                              ModelRunnerOutput)
 from vllm.v1.request import Request
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
@@ -37,15 +37,15 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
 from tpu_inference import utils as common_utils
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.sharding import (MESH_AXIS_NAMES,
+                                                  MESH_AXIS_NAMES_2D,
+                                                  ShardingAxisName,
+                                                  ShardingConfigManager)
 from tpu_inference.layers.jax.sample.rejection_sampler import RejectionSampler
 from tpu_inference.layers.jax.sample.sampling import (compute_logprobs,
                                                       gather_logprobs, sample)
 from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata
-from tpu_inference.layers.jax.sharding import (MESH_AXIS_NAMES,
-                                               MESH_AXIS_NAMES_2D,
-                                               ShardingAxisName,
-                                               ShardingConfigManager)
 from tpu_inference.logger import init_logger
 from tpu_inference.models.common.model_loader import get_model
 from tpu_inference.models.jax.utils.weight_utils import (
@@ -190,6 +190,21 @@ def _substitute_placeholder_token(
     return input_ids.at[token_in_tpu_cur_input_indices].set(update_values)
 
 
+def _reorder_logits_indices(logprobs_lists, logits_indices_selector):
+    return LogprobsLists(
+        logprob_token_ids=[
+            logprobs_lists.logprob_token_ids[i]
+            for i in logits_indices_selector
+        ],
+        logprobs=[logprobs_lists.logprobs[i] for i in logits_indices_selector],
+        sampled_token_ranks=[
+            logprobs_lists.sampled_token_ranks[i]
+            for i in logits_indices_selector
+        ],
+        cu_num_generated_tokens=logprobs_lists.cu_num_generated_tokens,
+    )
+
+
 class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
 
     def __init__(
@@ -840,7 +855,12 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
                 logits_indices_selector)
 
         if logprobs is not None:
+            # Map logprobs back to the pre-dp shuffling order
             logprobs_lists = logprobs.tolists()
+            if logits_indices_selector is not None:
+                logprobs_lists = _reorder_logits_indices(
+                    logprobs_lists, logits_indices_selector)
+
         else:
             logprobs_lists = None
 
@@ -908,7 +928,11 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
             req_state.output_token_ids.extend(sampled_ids)
 
         if logprobs is not None:
+            # Map logprobs back to the pre-dp shuffling order
             logprobs_lists = logprobs.tolists()
+            if logits_indices_selector is not None:
+                logprobs_lists = _reorder_logits_indices(
+                    logprobs_lists, logits_indices_selector)
         else:
             logprobs_lists = None
 
@@ -1315,10 +1339,10 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
             seq_lens_cpu = seq_lens
 
         (input_ids, positions, block_tables, query_start_loc, seq_lens,
-         logits_indices, request_distribution
+         logits_indices, request_distribution) = device_array(
             self.mesh,
             (input_ids, positions, block_tables, query_start_loc, seq_lens,
-             logits_indices, request_distribution
+             logits_indices, request_distribution),
             sharding=data_parallel_attn_sharding,
         )
         # Async scheduling: substitute placeholder tokens for DP
tpu_inference/utils.py
CHANGED
@@ -132,8 +132,8 @@ def pathways_hbm_usage_gb(devices: Any) -> List[Tuple[float, float]]:
     hbm_used = defaultdict(int)
     hbm_limit = get_device_hbm_limit()
     for array in live_arrays:
-        for buffer in array.
-            hbm_used[buffer.device] += buffer.nbytes
+        for buffer in array.addressable_shards:
+            hbm_used[buffer.data.device] += buffer.data.nbytes
     return [(hbm_used[device], hbm_limit) for device in devices]
 
 
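Note on the utils.py fix above: per-device HBM accounting now iterates array.addressable_shards and reads each shard's backing buffer (shard.data) for its device and byte count, which works for sharded jax.Arrays. A minimal version of the same accounting that runs on any backend (device names and byte counts depend on the local runtime):

from collections import defaultdict

import jax.numpy as jnp

def bytes_used_per_device(arrays):
    # Sum the bytes of every addressable shard, keyed by the device it lives on.
    used = defaultdict(int)
    for array in arrays:
        for shard in array.addressable_shards:
            used[shard.data.device] += shard.data.nbytes
    return dict(used)

x = jnp.ones((1024, 1024), dtype=jnp.float32)
print(bytes_used_per_device([x]))  # e.g. {CpuDevice(id=0): 4194304} on a CPU backend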
tpu_inference/worker/tpu_worker.py
CHANGED
@@ -25,7 +25,7 @@ from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from tpu_inference import envs, utils
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_transfer_port,
                                              get_node_id)
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
 from tpu_inference.runner.kv_cache import get_rpa_page_size_bytes
 from tpu_inference.runner.tpu_runner import TPUModelRunner