tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (67)
  1. tests/kernels/fused_moe_v1_test.py +34 -303
  2. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
  3. tests/lora/test_layers.py +6 -0
  4. tests/lora/utils.py +8 -0
  5. tests/test_utils.py +16 -24
  6. tpu_inference/__init__.py +3 -22
  7. tpu_inference/core/core_tpu.py +9 -17
  8. tpu_inference/core/disagg_utils.py +8 -6
  9. tpu_inference/distributed/tpu_connector.py +4 -3
  10. tpu_inference/distributed/utils.py +2 -3
  11. tpu_inference/envs.py +8 -61
  12. tpu_inference/executors/ray_distributed_executor.py +11 -31
  13. tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
  14. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
  15. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
  16. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
  17. tpu_inference/layers/jax/attention/attention.py +1 -1
  18. tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
  19. tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
  20. tpu_inference/layers/jax/sample/sampling.py +2 -2
  21. tpu_inference/layers/{common → jax}/sharding.py +5 -5
  22. tpu_inference/layers/vllm/attention.py +1 -1
  23. tpu_inference/layers/vllm/fused_moe.py +208 -170
  24. tpu_inference/layers/vllm/quantization/__init__.py +3 -7
  25. tpu_inference/layers/vllm/quantization/awq.py +3 -4
  26. tpu_inference/layers/vllm/quantization/common.py +1 -6
  27. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
  28. tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
  29. tpu_inference/layers/vllm/sharding.py +2 -2
  30. tpu_inference/lora/torch_punica_tpu.py +2 -1
  31. tpu_inference/mock/__init__.py +0 -0
  32. tpu_inference/mock/vllm_config_utils.py +28 -0
  33. tpu_inference/mock/vllm_envs.py +1219 -0
  34. tpu_inference/mock/vllm_logger.py +212 -0
  35. tpu_inference/mock/vllm_logging_utils.py +15 -0
  36. tpu_inference/models/common/model_loader.py +12 -46
  37. tpu_inference/models/jax/llama3.py +3 -4
  38. tpu_inference/models/jax/llama_eagle3.py +5 -8
  39. tpu_inference/models/jax/phi3.py +376 -0
  40. tpu_inference/models/jax/qwen2.py +2 -3
  41. tpu_inference/models/jax/qwen2_5_vl.py +50 -165
  42. tpu_inference/models/jax/qwen3.py +2 -3
  43. tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
  44. tpu_inference/models/jax/utils/weight_utils.py +143 -198
  45. tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
  46. tpu_inference/platforms/tpu_platform.py +34 -47
  47. tpu_inference/runner/compilation_manager.py +60 -145
  48. tpu_inference/runner/kv_cache.py +2 -2
  49. tpu_inference/runner/kv_cache_manager.py +18 -17
  50. tpu_inference/runner/persistent_batch_manager.py +2 -40
  51. tpu_inference/runner/structured_decoding_manager.py +3 -2
  52. tpu_inference/runner/tpu_runner.py +135 -283
  53. tpu_inference/runner/utils.py +2 -2
  54. tpu_inference/spec_decode/jax/eagle3.py +21 -71
  55. tpu_inference/tpu_info.py +3 -4
  56. tpu_inference/utils.py +15 -38
  57. tpu_inference/worker/tpu_worker.py +26 -163
  58. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
  59. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
  60. tests/test_envs.py +0 -203
  61. tpu_inference/layers/common/quant_methods.py +0 -8
  62. tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
  63. tpu_inference/models/jax/llama_guard_4.py +0 -361
  64. /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
  65. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
  66. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
  67. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ import tpu_inference.kernels.ragged_paged_attention.v3.kernel as rpa
 import tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 as rpa_hd64
 from tpu_inference.kernels.flash_attention.kernel import flash_attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.jax.sharding import ShardingAxisName
 from tpu_inference.utils import get_megacore

 MAX_ALLOWED_PAGE_INDICES_N = (
@@ -308,13 +308,7 @@ def sharded_ragged_paged_attention(
     args = (q, k, v, kv_cache, kv_lens, page_indices, cu_q_lens, distribution)

     use_hd64 = q.shape[-1] == 64
-
-    func = ragged_paged_attention
-    if use_hd64:
-        func = functools.partial(ragged_paged_attention_hd64,
-                                 strict_sliding_window=False)
-    else:
-        func = ragged_paged_attention
+    func = ragged_paged_attention_hd64 if use_hd64 else ragged_paged_attention

     if attention_sink is not None:
         if not use_hd64:
@@ -12,7 +12,7 @@ import jax
 import jax.numpy as jnp
 import numpy as np

-from tpu_inference.layers.common.binary_search import topk_mask, topp_mask
+from tpu_inference.layers.jax.binary_search import topk_mask, topp_mask
 from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata

@@ -6,10 +6,10 @@ from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
 from vllm.v1.outputs import LogprobsTensors

-from tpu_inference.layers.common.binary_search import topk_mask, topp_mask
-from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.jax.binary_search import topk_mask, topp_mask
 from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata
+from tpu_inference.layers.jax.sharding import ShardingAxisName

 _SAMPLING_EPS = 1e-5

@@ -1,5 +1,6 @@
 import json
 import math
+import os
 from dataclasses import asdict, dataclass
 from typing import TYPE_CHECKING, List, Optional

@@ -7,7 +8,7 @@ import jax.numpy as jnp
 import numpy as np
 from jax.sharding import Mesh

-from tpu_inference import envs, utils
+from tpu_inference import utils

 if TYPE_CHECKING:
     from vllm.v1.configs.vllm_config import VllmConfig
@@ -47,7 +48,7 @@ class ShardingAxisName2D:


 try:
-    _use_base_sharding = envs.NEW_MODEL_DESIGN
+    _use_base_sharding = os.getenv("NEW_MODEL_DESIGN", False)
     if _use_base_sharding:
         ShardingAxisName = ShardingAxisNameBase
     else:
@@ -165,10 +166,9 @@ class ShardingConfigManager:
                     f"LoRA is not supported with data parallelism "
                     f"(DP size: {total_dp_size}). Please disable LoRA or "
                     f"set data parallelism to 1.")
-        if sharding_strategy.attention_data_parallelism > 1:
-            if not envs.NEW_MODEL_DESIGN:
+            if not os.environ.get("NEW_MODEL_DESIGN", False):
                 raise ValueError(
-                    "Must run Attention DP with NEW_MODEL_DESIGN enabled. Please set the "
+                    "Must run DP with NEW_MODEL_DESIGN enabled. Please set the "
                     "NEW_MODEL_DESIGN=True.")

     @property
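
The two sharding.py hunks above replace the typed envs.NEW_MODEL_DESIGN flag with raw environment lookups. A minimal, hypothetical sketch of how such a lookup can be exercised; note that os.environ.get returns the raw string whenever the variable is set, so the value is parsed explicitly here (this helper is illustrative and not part of the package):

import os

def new_model_design_enabled() -> bool:
    # os.environ.get returns a str when the variable is set; parse it
    # explicitly instead of relying on the truthiness of the raw string.
    raw = os.environ.get("NEW_MODEL_DESIGN", "")
    return raw.strip().lower() in ("1", "true", "yes", "on")

os.environ["NEW_MODEL_DESIGN"] = "True"
assert new_model_design_enabled()
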
@@ -13,8 +13,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)

 from tpu_inference import utils
-from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.logger import init_logger
 from tpu_inference.models.vllm.vllm_model_wrapper_context import \
     get_vllm_model_wrapper_context
@@ -2,16 +2,17 @@ import functools

 import jax
 from jax import numpy as jnp
-from jax import shard_map
 from jax.experimental.pallas.ops.tpu.megablox.gmm import gmm
-from jax.sharding import Mesh
-from jax.sharding import PartitionSpec as P
+from jax.experimental.shard_map import shard_map
+from jax.sharding import Mesh, NamedSharding, PartitionSpec

 from tpu_inference.layers.vllm.linear_common import \
     slice_sharded_tensor_for_concatenation

+P = PartitionSpec

-def activation_fn(activation: str, x1: jax.Array, x2: jax.Array) -> jax.Array:
+
+def activation_fn(activation: str, x1, x2):
     match activation:
         case "silu":
             return jax.nn.silu(x1) * x2
@@ -22,10 +23,7 @@ def activation_fn(activation: str, x1: jax.Array, x2: jax.Array) -> jax.Array:
                 f"FusedMoE does not support {activation} activation")


-def _swigluoai(x1: jax.Array,
-               x2: jax.Array,
-               alpha=1.702,
-               limit=7.0) -> jax.Array:
+def _swigluoai(x1, x2, alpha=1.702, limit=7.0):
     x1 = jnp.clip(x1, a_max=limit)
     x2 = jnp.clip(x2, a_min=-limit, a_max=limit)

@@ -105,53 +103,40 @@ def tensor_sharded_gmm_merged_column_parallel(
     rhs: jax.Array,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
+    transpose_rhs: bool,
     mesh: Mesh,
-) -> tuple[jax.Array, jax.Array]:
-
-    def _gmm(lhs, rhs, group_sizes):
-        m, g, n, k = lhs.shape[0], *rhs.shape
-        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-        return gmm(
-            lhs,
-            rhs,
-            group_sizes,
-            preferred_element_type=lhs.dtype,
-            tiling=(tm, tk, tn),
-            transpose_rhs=True,
-            group_offset=jnp.array(0),
-        )
+    intermediate_size: int,
+) -> jax.Array:
+    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
+    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
+    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+
+    _gmm = functools.partial(
+        gmm,
+        preferred_element_type=lhs.dtype,
+        tiling=(tm, tk, tn),
+        transpose_rhs=transpose_rhs,
+        group_offset=jnp.array(0),
+    )

     gmm_result = shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(P("data", None), P(None, "model", None), P("data")),
-        out_specs=(P("data", "model")),
-        check_vma=False,
+        in_specs=(P(), P(None, "model", None), P()),
+        out_specs=(P(None, "model")),
+        check_rep=False,
     )(lhs, rhs, group_sizes)

     if rhs_bias is not None:
+        rhs_bis = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
+        gmm_result = (gmm_result + rhs_bis).astype(gmm_result.dtype)

-        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
-            rhs_bias = jnp.repeat(
-                rhs_bias_local,
-                group_sizes_global,
-                0,
-                total_repeat_length=gmm_result_local.shape[0])
-            return gmm_result_local + rhs_bias
-
-        gmm_result = shard_map(
-            _add_bias,
-            mesh=mesh,
-            in_specs=(P("data", "model"), P(None, "model"), P("data")),
-            out_specs=(P("data", "model")),
-        )(gmm_result, rhs_bias, group_sizes)
-        gmm_result = gmm_result.astype(lhs.dtype)
-
-    tp_size = mesh.shape["model"]
-    intermediate_size = gmm_result.shape[-1] // 2
+    n_shards = mesh.shape["model"]
     output_sizes = [intermediate_size, intermediate_size]
+
     return slice_sharded_tensor_for_concatenation(gmm_result, output_sizes,
-                                                  tp_size)
+                                                  n_shards)


 def tensor_sharded_gmm_row_parallel(
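
In the rewritten column-parallel path above, the per-expert bias is broadcast to one row per routed token with jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m). A self-contained sketch of that expansion with made-up shapes, showing why the static total_repeat_length is needed once the call sits under jax.jit:

import jax
import jax.numpy as jnp

num_experts, hidden = 3, 4
m = 6  # total rows (token copies) routed across all experts
bias = jnp.arange(num_experts * hidden, dtype=jnp.float32).reshape(num_experts, hidden)
group_sizes = jnp.array([1, 3, 2])  # rows assigned to each expert, sums to m

@jax.jit
def expand_bias(bias, group_sizes):
    # Repeat row i of `bias` group_sizes[i] times; total_repeat_length gives
    # the static output length that jit needs even though group_sizes is traced.
    return jnp.repeat(bias, group_sizes, axis=0, total_repeat_length=m)

per_token_bias = expand_bias(bias, group_sizes)
assert per_token_bias.shape == (m, hidden)
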
@@ -159,75 +144,74 @@ def tensor_sharded_gmm_row_parallel(
     rhs: jax.Array,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
+    transpose_rhs: bool,
     mesh: Mesh,
 ) -> jax.Array:
+    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
+    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
+    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+
+    _gmm = functools.partial(
+        gmm,
+        preferred_element_type=lhs.dtype,
+        tiling=(tm, tk, tn),
+        transpose_rhs=transpose_rhs,
+        group_offset=jnp.array(0),
+    )

     def _gmm_all_reduce(lhs, rhs, group_sizes):
-        m, g, n, k = lhs.shape[0], *rhs.shape
-        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-        out = gmm(
-            lhs,
-            rhs,
-            group_sizes,
-            preferred_element_type=lhs.dtype,
-            tiling=(tm, tk, tn),
-            transpose_rhs=True,
-            group_offset=jnp.array(0),
-        )
-        return jax.lax.psum(out, axis_name="model")
+        r = _gmm(lhs, rhs, group_sizes)
+        return jax.lax.psum(r, axis_name="model")

     gmm_result = shard_map(
         _gmm_all_reduce,
         mesh=mesh,
-        in_specs=(P("data", "model"), P(None, None, "model"), P("data")),
-        out_specs=(P("data")),
-        check_vma=False,
+        in_specs=(P(None, "model"), P(None, None, "model"), P()),
+        out_specs=(P()),
+        check_rep=False,
     )(lhs, rhs, group_sizes)

     if rhs_bias is not None:
+        rhs_bias = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
+        gmm_result = (gmm_result + rhs_bias).astype(gmm_result.dtype)

-        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
-            rhs_bias = jnp.repeat(
-                rhs_bias_local,
-                group_sizes_global,
-                0,
-                total_repeat_length=gmm_result_local.shape[0])
-            return gmm_result_local + rhs_bias
-
-        gmm_result = shard_map(
-            _add_bias,
-            mesh=mesh,
-            in_specs=(P("data"), P(), P("data")),
-            out_specs=(P("data")),
-        )(gmm_result, rhs_bias, group_sizes)
-
-    return gmm_result.astype(lhs.dtype)
+    return gmm_result


 def expert_sharded_gmm(
     lhs: jax.Array,
     rhs: jax.Array,
     group_sizes: jax.Array,
+    transpose_rhs: bool,
     mesh: Mesh,
+    num_experts: int,
+    ep_size: int,
 ) -> jax.Array:
-    ep_size = mesh.shape["model"]
+    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
+    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
+    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)

-    num_experts = rhs.shape[0]
     num_experts_per_shard = num_experts // ep_size
     group_offset = jnp.arange(0, num_experts, num_experts_per_shard)
+    group_offset = jax.lax.with_sharding_constraint(
+        group_offset, NamedSharding(mesh, P("model")))

     def _gmm(lhs, rhs, group_sizes, group_offset):
-        m, g, n, k = lhs.shape[0], *rhs.shape
-        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-
+        # Group offset for this shard. `group_offset` is sharded, and in this
+        # sharded function, it has only 1 element and `group_offset.shape` is
+        # (1,) but gmm kernel requires the group_offset to be a ()-shaped array,
+        # so we group_offset[0].
+        group_offset_of_shard = group_offset[0]
         gmm_res = gmm(
             lhs=lhs,
             rhs=rhs,
             group_sizes=group_sizes,
             preferred_element_type=lhs.dtype,
             tiling=(tm, tk, tn),
-            transpose_rhs=True,
-            group_offset=group_offset[0],
+            transpose_rhs=transpose_rhs,
+            group_offset=group_offset_of_shard,
         )
         return gmm_res

@@ -254,24 +238,30 @@ def expert_sharded_gmm(
         mesh=mesh,
         in_specs=(P(), P("model", None, None), P(), P("model")),
         out_specs=(P("model", None)),
-        check_vma=False,
+        check_rep=False,
     )(lhs, rhs, group_sizes, group_offset)

     # For i-th shard, it is responsible groups (AKA experts) from
     # i*num_experts_per_shard to (i+1)*num_experts_per_shard We sum them up to
     # get total rows in that shard, and that is the size for shard to send to
     # its peers. This is also the number of non-zero rows from the gmm results.
-    # In the working example, send_sizes would be [3, 2, 5, 4].
-
-    # group_sizes has shape of [num_tokens_per_shard * num_experts_per_shard].
-    # So reshaping to [num_tokens_per_shard, num_experts_per_shard] and applying
-    # sum(axis=1) will get desired send_sizes shaped [num_tokens_per_shard].
-    send_sizes = group_sizes.reshape(-1, num_experts_per_shard).sum(axis=1)
+    # In the working example, send_sizes would be [3, 2, 5, 4]
+    send_sizes = jnp.array([
+        group_sizes[i * num_experts_per_shard:(i + 1) *
+                    num_experts_per_shard].sum() for i in range(ep_size)
+    ])
     # In the working example, input_offsets would be [0, 3, 5, 10]
     input_offsets = jnp.concatenate((jnp.array([0]), send_sizes.cumsum()[:-1]))
     output_offsets = input_offsets
     recv_sizes = send_sizes

+    input_offsets = jax.lax.with_sharding_constraint(
+        input_offsets, NamedSharding(mesh, P("model")))
+    send_sizes = jax.lax.with_sharding_constraint(
+        send_sizes, NamedSharding(mesh, P("model")))
+    output_offsets = jax.lax.with_sharding_constraint(
+        output_offsets, NamedSharding(mesh, P("model")))
+
     def _ragged_all_to_all(operand, input_offsets, send_sizes, output_offsets,
                            recv_sizes):
         output = jnp.zeros_like(operand)
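
The comments above walk through a working example in which the per-shard row counts come out to [3, 2, 5, 4] and the offsets to [0, 3, 5, 10]. A short sketch reproducing that arithmetic, assuming 4 shards with 2 experts each (the concrete group_sizes values are made up to match the example):

import jax.numpy as jnp

ep_size = 4
num_experts_per_shard = 2
# Per-expert row counts; experts 0-1 live on shard 0, experts 2-3 on shard 1, etc.
group_sizes = jnp.array([1, 2, 2, 0, 4, 1, 3, 1])

send_sizes = jnp.array([
    group_sizes[i * num_experts_per_shard:(i + 1) *
                num_experts_per_shard].sum() for i in range(ep_size)
])
input_offsets = jnp.concatenate((jnp.array([0]), send_sizes.cumsum()[:-1]))

print(send_sizes)     # [3 2 5 4]
print(input_offsets)  # [0 3 5 10]
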
@@ -326,20 +316,10 @@ def expert_sharded_gmm(
         mesh=mesh,
         in_specs=(P("model", None), P("model"), P("model"), P("model"), P()),
         out_specs=(P()),
-        check_vma=False,
+        check_rep=False,
     )(gmm_res, input_offsets, send_sizes, output_offsets, recv_sizes)


-@functools.partial(
-    jax.jit,
-    static_argnames=(
-        "topk",
-        "renormalize",
-        "mesh",
-        "use_ep",
-        "activation",
-    ),
-)
 def fused_moe_func(
     hidden_states: jax.Array,
     w1: jax.Array,
348
328
  w2_bias: jax.Array | None,
349
329
  gating_output: jax.Array,
350
330
  topk: int,
331
+ global_num_experts: int,
351
332
  renormalize: bool,
333
+ reduce_results: bool,
352
334
  mesh: Mesh,
353
335
  use_ep: bool,
354
336
  activation: str,
355
- ) -> jax.Array:
337
+ ):
356
338
  """
357
- Route tokens in hidden_states into each experts based on routing
358
- information in gating_out and performs moe with w1 and w2 weights.
359
-
360
339
  Args:
361
- hidden_states: [num_tokens, hidden_size]
362
- w1: first moe weights [num_experts, intermediate_size * 2, hidden_size]
363
- w2: second moe weights [num_experts, hidden_size, intermediate_size]
364
- w1_bias: optional bias of w1 [num_experts, intermediate_size * 2]
365
- w2_bias: optional bias of w2 [num_experts, hidden_size]
366
- gating_output: routing information of tokens [num_tokens, num_experts]
367
- topk: number of experts to choose per token.
368
- renormalize: normalize gating_output.
369
- mesh: mesh to perform moe.
370
- use_ep: use expert parallelism.
371
- activation: activation function to perform on the output of w1.
372
-
373
- Returns:
374
- Output of moe operation [num_tokens, hidden_size]
340
+ hidden_states: [*, hidden_size]
341
+ w1: [num_experts, intermediate_size * 2, hidden_size]
342
+ w2: [num_experts, hidden_size, intermediate_size]
343
+ gating_output: [*, num_experts]
375
344
  """
345
+ # adapted from https://github.com/vllm-project/vllm/blob/29fa5cac1cd731026f59084d93a822921507573c/vllm/model_executor/layers/fused_moe/moe_pallas.py#L26
376
346
  if use_ep and (w1_bias is not None or w2_bias is not None):
377
347
  raise NotImplementedError(
378
348
  "Bias is not supported when using expert parallelism.")
379
-
380
- num_tokens = hidden_states.shape[0]
381
- global_num_experts, hidden_size, intermediate_size = w2.shape
349
+ orig_shape = hidden_states.shape
350
+ hidden_size = hidden_states.shape[-1]
351
+ num_tokens = hidden_states.size // hidden_size
352
+ assert global_num_experts == w1.shape[0]
353
+ ep_size = mesh.shape["model"] # only used if use_ep is True.
354
+ intermediate_size = w2.shape[-1]
382
355
  dtype = hidden_states.dtype
383
-
384
356
  assert (num_tokens * topk) % 16 == 0, (
385
357
  "The kernel requires num_tokens * topk to be a multiple of "
386
358
  f"16 but got {num_tokens}*{topk}={num_tokens*topk}")
387
- assert hidden_states.shape == (num_tokens, hidden_size)
388
- assert gating_output.shape == (num_tokens, global_num_experts)
389
- assert w1.shape == (global_num_experts, intermediate_size * 2, hidden_size)
359
+
360
+ hidden_states = hidden_states.reshape(num_tokens, hidden_size)
361
+ gating_output = gating_output.reshape(num_tokens, global_num_experts)
390
362
 
391
363
  topk_weights = jax.nn.softmax(gating_output.astype(jnp.float32), axis=-1)
392
364
  topk_weights, topk_indices = jax.lax.top_k(topk_weights, k=topk)
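
The new preamble above flattens hidden_states to [num_tokens, hidden_size] and keeps the kernel requirement that num_tokens * topk be a multiple of 16; the fused_moe_func_padded wrapper in the next hunk tiles small batches up to that boundary. A quick sketch of the padding arithmetic with illustrative shapes:

import jax.numpy as jnp

# Illustrative shapes: 2 tokens with topk=4 give num_tokens * topk == 8 < 16,
# so the batch is tiled until it reaches the 16-row boundary.
num_tokens, topk, hidden_size = 2, 4, 8
hidden_states = jnp.ones((num_tokens, hidden_size))

assert 16 % (num_tokens * topk) == 0
n_repeats = 16 // (num_tokens * topk)          # 2
reps = (n_repeats, ) + (1, ) * (hidden_states.ndim - 1)
expanded = jnp.tile(hidden_states, reps)       # shape (4, 8)

assert (expanded.shape[0] * topk) % 16 == 0
# After the MoE call, only the first num_tokens rows are kept again.
restored = expanded[:num_tokens]
assert restored.shape == hidden_states.shape
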
@@ -394,76 +366,142 @@ def fused_moe_func(
     topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
     topk_weights = topk_weights.astype(dtype)

-    def _process_tokens_locally(hidden_states_local, topk_indices_local):
-        num_tokens_local = hidden_states_local.shape[0]
-        topk_indices_flat = topk_indices_local.flatten()
-        topk_argsort_indices = jnp.argsort(topk_indices_flat)
-        topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
-        token_indices = jnp.arange(num_tokens_local,
-                                   dtype=jnp.int32).repeat(topk)
-        token_indices_sorted = token_indices[topk_argsort_indices]
-        group_sizes_local = jnp.bincount(topk_indices_flat,
-                                         length=global_num_experts)
-
-        x = hidden_states_local[token_indices_sorted]
-        return x, group_sizes_local, topk_argsort_revert_indices
-
-    x, group_sizes, topk_argsort_revert_indices = shard_map(
-        _process_tokens_locally,
-        mesh=mesh,
-        in_specs=(P("data", None), P("data", None)),
-        out_specs=(P("data", None), P("data"), P("data")),
-    )(hidden_states, topk_indices)
+    topk_indices_flat = topk_indices.flatten()
+    topk_argsort_indices = jnp.argsort(topk_indices_flat)
+    topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
+    token_indices = jnp.arange(num_tokens, dtype=jnp.int32).repeat(topk)
+    token_indices_sorted = token_indices[topk_argsort_indices]
+    group_sizes = jnp.bincount(topk_indices_flat, length=global_num_experts)
+
+    x = hidden_states[token_indices_sorted]

     if use_ep:
         x = expert_sharded_gmm(
             x,
             w1,
             group_sizes,
+            transpose_rhs=True,
             mesh=mesh,
+            num_experts=global_num_experts,
+            ep_size=ep_size,
         )
-        x1, x2 = jnp.split(x, 2, -1)
-
-        x = activation_fn(activation, x1, x2)
-
-        x = expert_sharded_gmm(
-            x,
-            w2,
-            group_sizes,
-            mesh=mesh,
-        )
+        x1, x2 = x[..., :intermediate_size], x[..., intermediate_size:]
     else:
         x1, x2 = tensor_sharded_gmm_merged_column_parallel(
             x,
             w1,
             w1_bias,
             group_sizes,
+            transpose_rhs=True,
             mesh=mesh,
+            intermediate_size=intermediate_size,
         )

-        x = activation_fn(activation, x1, x2)
+    x = activation_fn(activation, x1, x2)

+    if use_ep:
+        x = expert_sharded_gmm(
+            x,
+            w2,
+            group_sizes,
+            transpose_rhs=True,
+            mesh=mesh,
+            num_experts=global_num_experts,
+            ep_size=ep_size,
+        )
+    else:
+        x = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P(None, "model")))
         x = tensor_sharded_gmm_row_parallel(
             x,
             w2,
             w2_bias,
             group_sizes,
+            transpose_rhs=True,
             mesh=mesh,
         )

-    def _finalize_output(x_local, topk_argsort_revert_indices_local,
-                         topk_weights_local):
-        x_local = x_local[topk_argsort_revert_indices_local].reshape(
-            -1, topk, hidden_size)
-        x_local = x_local * jnp.expand_dims(topk_weights_local, axis=-1)
-        x_local = x_local.sum(axis=-2)
-        return x_local
+    x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
+    x = x * jnp.expand_dims(topk_weights, axis=-1)
+    x = x.sum(axis=-2)
+    x = x.reshape(orig_shape)

-    x = shard_map(
-        _finalize_output,
-        mesh=mesh,
-        in_specs=(P("data", None), P("data"), P("data", None)),
-        out_specs=(P("data", None)),
-    )(x, topk_argsort_revert_indices, topk_weights)
+    if reduce_results:
+        x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P()))
+    return x

-    return x[:num_tokens, :hidden_size]
+
+@functools.partial(
+    jax.jit,
+    static_argnames=(
+        "topk",
+        "global_num_experts",
+        "renormalize",
+        "reduce_results",
+        "mesh",
+        "use_ep",
+        "activation",
+    ),
+)
+def fused_moe_func_padded(
+    hidden_states: jax.Array,
+    w1: jax.Array,
+    w2: jax.Array,
+    w1_bias: jax.Array | None,
+    w2_bias: jax.Array | None,
+    gating_output: jax.Array,
+    topk: int,
+    global_num_experts: int,
+    renormalize: bool,
+    reduce_results: bool,
+    mesh: Mesh,
+    use_ep: bool,
+    activation: str,
+):
+    # TODO(fanhongmin@google.com): Once the jax runner pads the input, we no longer need this.
+    hidden_size = hidden_states.shape[-1]
+    num_tokens = hidden_states.size // hidden_size
+    if num_tokens * topk < 16:
+        assert 16 % (num_tokens *
+                     topk) == 0, f"Cannot pad to 16: {num_tokens=}, {topk=}"
+        n_repeats = 16 // (num_tokens * topk)
+
+        reps = (n_repeats, ) + (1, ) * (hidden_states.ndim - 1)
+        expanded_hidden_states = jnp.tile(hidden_states, reps)
+
+        reps = (n_repeats, ) + (1, ) * (gating_output.ndim - 1)
+        expanded_gating_output = jnp.tile(gating_output, reps)
+
+        expanded_x = fused_moe_func(
+            expanded_hidden_states,
+            w1,
+            w2,
+            w1_bias,
+            w2_bias,
+            expanded_gating_output,
+            topk,
+            global_num_experts,
+            renormalize,
+            reduce_results,
+            mesh,
+            use_ep,
+            activation,
+        )
+        x = expanded_x[:hidden_states.shape[0]]
+        return x
+    else:
+        return fused_moe_func(
+            hidden_states,
+            w1,
+            w2,
+            w1_bias,
+            w2_bias,
+            gating_output,
+            topk,
+            global_num_experts,
+            renormalize,
+            reduce_results,
+            mesh,
+            use_ep,
+            activation,
+        )
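
The rewritten fused_moe_func above routes tokens by flattening the top-k expert ids, sorting the token copies by expert, and later inverting the sort with a second argsort. A compact sketch of that sort/unsort round trip on made-up routing data:

import jax.numpy as jnp

num_tokens, topk, num_experts = 4, 2, 3
# Expert ids chosen per token (e.g. from jax.lax.top_k on the router logits).
topk_indices = jnp.array([[0, 2], [1, 2], [0, 1], [2, 1]])

topk_indices_flat = topk_indices.flatten()             # 8 token copies
topk_argsort_indices = jnp.argsort(topk_indices_flat)  # order copies by expert id
topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)

token_indices = jnp.arange(num_tokens, dtype=jnp.int32).repeat(topk)
token_indices_sorted = token_indices[topk_argsort_indices]

# Contiguous per-expert row counts consumed by the grouped matmul.
group_sizes = jnp.bincount(topk_indices_flat, length=num_experts)  # [2 3 3]

# After the expert computation, rows are gathered back into token order:
rows = token_indices_sorted  # stand-in for the grouped-matmul output rows
assert (rows[topk_argsort_revert_indices] == token_indices).all()
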
@@ -5,12 +5,10 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizationConfig

-from tpu_inference.layers.common import quant_methods
 from tpu_inference.layers.vllm.quantization.awq import VllmAWQConfig
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import \
     VllmCompressedTensorsConfig  # noqa: E501
-from tpu_inference.layers.vllm.quantization.mxfp4 import VllmMxfp4Config
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedConfig

@@ -21,9 +19,8 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
     # TODO(kyuyeunk): Add support for "tpu_int8".
     method_to_config: dict[str, str] = {
         None: VllmUnquantizedConfig,
-        quant_methods.COMPRESSED_TENSORS: VllmCompressedTensorsConfig,
-        quant_methods.AWQ: VllmAWQConfig,
-        quant_methods.MXFP4: VllmMxfp4Config,
+        "compressed-tensors": VllmCompressedTensorsConfig,
+        "awq": VllmAWQConfig,
     }
     if model_config.quantization not in method_to_config:
         raise NotImplementedError(
@@ -33,7 +30,6 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
     assert issubclass(quant_config, JaxCommonConfig)
     quant_config.set_configs(vllm_config, mesh)

-    model_config.quantization = quant_methods.get_tpu_quant_method(
-        quant_config.get_name())
+    model_config.quantization = quant_config.get_name()
     return VllmConfig.get_quantization_config(model_config,
                                               vllm_config.load_config)