tpu-inference 0.11.1.dev202511180814__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +303 -34
- tests/kernels/mla_v1_test.py +129 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
- tests/lora/test_layers.py +4 -7
- tests/lora/test_lora_perf.py +53 -0
- tests/lora/utils.py +0 -8
- tests/test_envs.py +110 -12
- tests/test_quantization.py +3 -0
- tests/test_utils.py +1 -2
- tpu_inference/__init__.py +22 -3
- tpu_inference/core/disagg_utils.py +6 -8
- tpu_inference/distributed/tpu_connector.py +3 -4
- tpu_inference/distributed/utils.py +3 -2
- tpu_inference/envs.py +93 -9
- tpu_inference/executors/ray_distributed_executor.py +9 -2
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
- tpu_inference/kernels/mla/v1/kernel.py +98 -120
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +140 -67
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +204 -120
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
- tpu_inference/layers/common/attention_interface.py +7 -1
- tpu_inference/layers/common/sharding.py +11 -7
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +170 -208
- tpu_inference/layers/vllm/linear_common.py +43 -21
- tpu_inference/layers/vllm/quantization/common.py +11 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
- tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
- tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +1 -2
- tpu_inference/models/common/model_loader.py +84 -28
- tpu_inference/models/jax/deepseek_v3.py +185 -64
- tpu_inference/models/jax/gpt_oss.py +3 -3
- tpu_inference/models/jax/llama3.py +2 -1
- tpu_inference/models/jax/llama_eagle3.py +8 -5
- tpu_inference/models/jax/llama_guard_4.py +361 -0
- tpu_inference/models/jax/qwen2.py +2 -1
- tpu_inference/models/jax/qwen2_5_vl.py +163 -48
- tpu_inference/models/jax/qwen3.py +2 -1
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
- tpu_inference/models/jax/utils/weight_utils.py +205 -144
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -8
- tpu_inference/platforms/tpu_platform.py +34 -50
- tpu_inference/runner/compilation_manager.py +144 -60
- tpu_inference/runner/kv_cache.py +40 -20
- tpu_inference/runner/kv_cache_manager.py +48 -33
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/structured_decoding_manager.py +2 -3
- tpu_inference/runner/tpu_runner.py +280 -149
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +71 -21
- tpu_inference/tpu_info.py +4 -3
- tpu_inference/utils.py +46 -18
- tpu_inference/worker/tpu_worker.py +197 -63
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +9 -10
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +70 -74
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +0 -28
- tpu_inference/mock/vllm_envs.py +0 -1219
- tpu_inference/mock/vllm_logger.py +0 -212
- tpu_inference/mock/vllm_logging_utils.py +0 -15
- tpu_inference/models/jax/phi3.py +0 -376
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,16 @@ import functools
 
 import jax
 from jax import numpy as jnp
+from jax import shard_map
 from jax.experimental.pallas.ops.tpu.megablox.gmm import gmm
-from jax.
-from jax.sharding import
+from jax.sharding import Mesh
+from jax.sharding import PartitionSpec as P
 
 from tpu_inference.layers.vllm.linear_common import \
     slice_sharded_tensor_for_concatenation
 
-P = PartitionSpec
 
-
-def activation_fn(activation: str, x1, x2):
+def activation_fn(activation: str, x1: jax.Array, x2: jax.Array) -> jax.Array:
     match activation:
         case "silu":
             return jax.nn.silu(x1) * x2
@@ -23,7 +22,10 @@ def activation_fn(activation: str, x1, x2):
                 f"FusedMoE does not support {activation} activation")
 
 
-def _swigluoai(x1
+def _swigluoai(x1: jax.Array,
+               x2: jax.Array,
+               alpha=1.702,
+               limit=7.0) -> jax.Array:
     x1 = jnp.clip(x1, a_max=limit)
     x2 = jnp.clip(x2, a_min=-limit, a_max=limit)
 
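As a quick sanity check on the hunk above (toy values, not package code): the "silu" branch of activation_fn gates the second half of the first grouped matmul's output with the SiLU of the first half, i.e. silu(x1) * x2.

    import jax
    import jax.numpy as jnp

    # Mirrors the diff's silu case: return jax.nn.silu(x1) * x2
    x1 = jnp.array([1.0, -2.0])
    x2 = jnp.array([0.5, 3.0])
    print(jax.nn.silu(x1) * x2)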
@@ -103,40 +105,53 @@ def tensor_sharded_gmm_merged_column_parallel(
     rhs: jax.Array,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
-    transpose_rhs: bool,
     mesh: Mesh,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+) -> tuple[jax.Array, jax.Array]:
+
+    def _gmm(lhs, rhs, group_sizes):
+        m, g, n, k = lhs.shape[0], *rhs.shape
+        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+        return gmm(
+            lhs,
+            rhs,
+            group_sizes,
+            preferred_element_type=lhs.dtype,
+            tiling=(tm, tk, tn),
+            transpose_rhs=True,
+            group_offset=jnp.array(0),
+        )
 
     gmm_result = shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(P(), P(None, "model", None), P()),
-        out_specs=(P(
-
+        in_specs=(P("data", None), P(None, "model", None), P("data")),
+        out_specs=(P("data", "model")),
+        check_vma=False,
     )(lhs, rhs, group_sizes)
 
     if rhs_bias is not None:
-        rhs_bis = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
-        gmm_result = (gmm_result + rhs_bis).astype(gmm_result.dtype)
 
-
-
+        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
+            rhs_bias = jnp.repeat(
+                rhs_bias_local,
+                group_sizes_global,
+                0,
+                total_repeat_length=gmm_result_local.shape[0])
+            return gmm_result_local + rhs_bias
+
+        gmm_result = shard_map(
+            _add_bias,
+            mesh=mesh,
+            in_specs=(P("data", "model"), P(None, "model"), P("data")),
+            out_specs=(P("data", "model")),
+        )(gmm_result, rhs_bias, group_sizes)
+        gmm_result = gmm_result.astype(lhs.dtype)
 
+    tp_size = mesh.shape["model"]
+    intermediate_size = gmm_result.shape[-1] // 2
+    output_sizes = [intermediate_size, intermediate_size]
     return slice_sharded_tensor_for_concatenation(gmm_result, output_sizes,
-
+                                                   tp_size)
 
 
 def tensor_sharded_gmm_row_parallel(
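The rewritten tensor_sharded_gmm_merged_column_parallel above wraps the megablox gmm call in shard_map so each device runs the grouped matmul only on its local slice of the expert weights. Below is a minimal sketch of that column-parallel pattern with a plain matmul; the toy shapes, the single-device 1x1 mesh, and the jax.experimental import path are assumptions for illustration, and only the axis names "data" and "model" are taken from the diff.

    import numpy as np
    import jax
    import jax.numpy as jnp
    from jax.sharding import Mesh, PartitionSpec as P
    from jax.experimental.shard_map import shard_map

    # 1x1 mesh so the sketch runs anywhere; real meshes place devices on both axes.
    mesh = Mesh(np.array(jax.devices()[:1]).reshape(1, 1), ("data", "model"))

    def local_matmul(x, w):
        # Each shard holds a slice of the tokens ("data") and of the output
        # features ("model"); no collective is needed because the contraction
        # dimension is replicated, as in the column-parallel case above.
        return x @ w

    y = shard_map(
        local_matmul,
        mesh=mesh,
        in_specs=(P("data", None), P(None, "model")),
        out_specs=P("data", "model"),
    )(jnp.ones((8, 16)), jnp.ones((16, 4)))
    print(y.shape)  # (8, 4)

The row-parallel variant in the next hunk shards the contraction dimension instead, which is why its local gmm result is combined with jax.lax.psum over the "model" axis.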
@@ -144,74 +159,75 @@ def tensor_sharded_gmm_row_parallel(
     rhs: jax.Array,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
-    transpose_rhs: bool,
     mesh: Mesh,
 ) -> jax.Array:
-    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
-    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
-    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-
-    _gmm = functools.partial(
-        gmm,
-        preferred_element_type=lhs.dtype,
-        tiling=(tm, tk, tn),
-        transpose_rhs=transpose_rhs,
-        group_offset=jnp.array(0),
-    )
 
     def _gmm_all_reduce(lhs, rhs, group_sizes):
-
-
+        m, g, n, k = lhs.shape[0], *rhs.shape
+        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+        out = gmm(
+            lhs,
+            rhs,
+            group_sizes,
+            preferred_element_type=lhs.dtype,
+            tiling=(tm, tk, tn),
+            transpose_rhs=True,
+            group_offset=jnp.array(0),
+        )
+        return jax.lax.psum(out, axis_name="model")
 
     gmm_result = shard_map(
         _gmm_all_reduce,
         mesh=mesh,
-        in_specs=(P(
-        out_specs=(P()),
-
+        in_specs=(P("data", "model"), P(None, None, "model"), P("data")),
+        out_specs=(P("data")),
+        check_vma=False,
     )(lhs, rhs, group_sizes)
 
     if rhs_bias is not None:
-        rhs_bias = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
-        gmm_result = (gmm_result + rhs_bias).astype(gmm_result.dtype)
 
-
+        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
+            rhs_bias = jnp.repeat(
+                rhs_bias_local,
+                group_sizes_global,
+                0,
+                total_repeat_length=gmm_result_local.shape[0])
+            return gmm_result_local + rhs_bias
+
+        gmm_result = shard_map(
+            _add_bias,
+            mesh=mesh,
+            in_specs=(P("data"), P(), P("data")),
+            out_specs=(P("data")),
+        )(gmm_result, rhs_bias, group_sizes)
+
+    return gmm_result.astype(lhs.dtype)
 
 
 def expert_sharded_gmm(
     lhs: jax.Array,
     rhs: jax.Array,
     group_sizes: jax.Array,
-    transpose_rhs: bool,
     mesh: Mesh,
-    num_experts: int,
-    ep_size: int,
 ) -> jax.Array:
-
-    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
-    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+    ep_size = mesh.shape["model"]
 
+    num_experts = rhs.shape[0]
     num_experts_per_shard = num_experts // ep_size
     group_offset = jnp.arange(0, num_experts, num_experts_per_shard)
-    group_offset = jax.lax.with_sharding_constraint(
-        group_offset, NamedSharding(mesh, P("model")))
 
     def _gmm(lhs, rhs, group_sizes, group_offset):
-
-
-
-        # so we group_offset[0].
-        group_offset_of_shard = group_offset[0]
+        m, g, n, k = lhs.shape[0], *rhs.shape
+        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+
         gmm_res = gmm(
             lhs=lhs,
             rhs=rhs,
             group_sizes=group_sizes,
             preferred_element_type=lhs.dtype,
             tiling=(tm, tk, tn),
-            transpose_rhs=
-            group_offset=
+            transpose_rhs=True,
+            group_offset=group_offset[0],
         )
         return gmm_res
 
@@ -238,30 +254,24 @@ def expert_sharded_gmm(
         mesh=mesh,
         in_specs=(P(), P("model", None, None), P(), P("model")),
         out_specs=(P("model", None)),
-
+        check_vma=False,
     )(lhs, rhs, group_sizes, group_offset)
 
     # For i-th shard, it is responsible groups (AKA experts) from
     # i*num_experts_per_shard to (i+1)*num_experts_per_shard We sum them up to
     # get total rows in that shard, and that is the size for shard to send to
     # its peers. This is also the number of non-zero rows from the gmm results.
-    # In the working example, send_sizes would be [3, 2, 5, 4]
-
-
-
-    ]
+    # In the working example, send_sizes would be [3, 2, 5, 4].
+
+    # group_sizes has shape of [num_tokens_per_shard * num_experts_per_shard].
+    # So reshaping to [num_tokens_per_shard, num_experts_per_shard] and applying
+    # sum(axis=1) will get desired send_sizes shaped [num_tokens_per_shard].
+    send_sizes = group_sizes.reshape(-1, num_experts_per_shard).sum(axis=1)
     # In the working example, input_offsets would be [0, 3, 5, 10]
     input_offsets = jnp.concatenate((jnp.array([0]), send_sizes.cumsum()[:-1]))
     output_offsets = input_offsets
     recv_sizes = send_sizes
 
-    input_offsets = jax.lax.with_sharding_constraint(
-        input_offsets, NamedSharding(mesh, P("model")))
-    send_sizes = jax.lax.with_sharding_constraint(
-        send_sizes, NamedSharding(mesh, P("model")))
-    output_offsets = jax.lax.with_sharding_constraint(
-        output_offsets, NamedSharding(mesh, P("model")))
-
     def _ragged_all_to_all(operand, input_offsets, send_sizes, output_offsets,
                            recv_sizes):
         output = jnp.zeros_like(operand)
@@ -316,10 +326,20 @@ def expert_sharded_gmm(
         mesh=mesh,
         in_specs=(P("model", None), P("model"), P("model"), P("model"), P()),
         out_specs=(P()),
-
+        check_vma=False,
     )(gmm_res, input_offsets, send_sizes, output_offsets, recv_sizes)
 
 
+@functools.partial(
+    jax.jit,
+    static_argnames=(
+        "topk",
+        "renormalize",
+        "mesh",
+        "use_ep",
+        "activation",
+    ),
+)
 def fused_moe_func(
     hidden_states: jax.Array,
     w1: jax.Array,
@@ -328,37 +348,45 @@ def fused_moe_func(
     w2_bias: jax.Array | None,
     gating_output: jax.Array,
     topk: int,
-    global_num_experts: int,
     renormalize: bool,
-    reduce_results: bool,
     mesh: Mesh,
     use_ep: bool,
     activation: str,
-):
+) -> jax.Array:
     """
+    Route tokens in hidden_states into each experts based on routing
+    information in gating_out and performs moe with w1 and w2 weights.
+
     Args:
-        hidden_states: [
-        w1: [num_experts, intermediate_size * 2, hidden_size]
-        w2: [num_experts, hidden_size, intermediate_size]
-
+        hidden_states: [num_tokens, hidden_size]
+        w1: first moe weights [num_experts, intermediate_size * 2, hidden_size]
+        w2: second moe weights [num_experts, hidden_size, intermediate_size]
+        w1_bias: optional bias of w1 [num_experts, intermediate_size * 2]
+        w2_bias: optional bias of w2 [num_experts, hidden_size]
+        gating_output: routing information of tokens [num_tokens, num_experts]
+        topk: number of experts to choose per token.
+        renormalize: normalize gating_output.
+        mesh: mesh to perform moe.
+        use_ep: use expert parallelism.
+        activation: activation function to perform on the output of w1.
+
+    Returns:
+        Output of moe operation [num_tokens, hidden_size]
     """
-    # adapted from https://github.com/vllm-project/vllm/blob/29fa5cac1cd731026f59084d93a822921507573c/vllm/model_executor/layers/fused_moe/moe_pallas.py#L26
     if use_ep and (w1_bias is not None or w2_bias is not None):
         raise NotImplementedError(
             "Bias is not supported when using expert parallelism.")
-
-
-
-    assert global_num_experts == w1.shape[0]
-    ep_size = mesh.shape["model"]  # only used if use_ep is True.
-    intermediate_size = w2.shape[-1]
+
+    num_tokens = hidden_states.shape[0]
+    global_num_experts, hidden_size, intermediate_size = w2.shape
     dtype = hidden_states.dtype
+
     assert (num_tokens * topk) % 16 == 0, (
         "The kernel requires num_tokens * topk to be a multiple of "
         f"16 but got {num_tokens}*{topk}={num_tokens*topk}")
-
-
-
+    assert hidden_states.shape == (num_tokens, hidden_size)
+    assert gating_output.shape == (num_tokens, global_num_experts)
+    assert w1.shape == (global_num_experts, intermediate_size * 2, hidden_size)
 
     topk_weights = jax.nn.softmax(gating_output.astype(jnp.float32), axis=-1)
     topk_weights, topk_indices = jax.lax.top_k(topk_weights, k=topk)
@@ -366,142 +394,76 @@ def fused_moe_func(
         topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
     topk_weights = topk_weights.astype(dtype)
 
-
-
-
-
-
-
-
-
+    def _process_tokens_locally(hidden_states_local, topk_indices_local):
+        num_tokens_local = hidden_states_local.shape[0]
+        topk_indices_flat = topk_indices_local.flatten()
+        topk_argsort_indices = jnp.argsort(topk_indices_flat)
+        topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
+        token_indices = jnp.arange(num_tokens_local,
+                                   dtype=jnp.int32).repeat(topk)
+        token_indices_sorted = token_indices[topk_argsort_indices]
+        group_sizes_local = jnp.bincount(topk_indices_flat,
+                                         length=global_num_experts)
+
+        x = hidden_states_local[token_indices_sorted]
+        return x, group_sizes_local, topk_argsort_revert_indices
+
+    x, group_sizes, topk_argsort_revert_indices = shard_map(
+        _process_tokens_locally,
+        mesh=mesh,
+        in_specs=(P("data", None), P("data", None)),
+        out_specs=(P("data", None), P("data"), P("data")),
+    )(hidden_states, topk_indices)
 
     if use_ep:
         x = expert_sharded_gmm(
             x,
             w1,
             group_sizes,
-            transpose_rhs=True,
             mesh=mesh,
-            num_experts=global_num_experts,
-            ep_size=ep_size,
         )
-        x1, x2 = x
+        x1, x2 = jnp.split(x, 2, -1)
+
+        x = activation_fn(activation, x1, x2)
+
+        x = expert_sharded_gmm(
+            x,
+            w2,
+            group_sizes,
+            mesh=mesh,
+        )
     else:
         x1, x2 = tensor_sharded_gmm_merged_column_parallel(
             x,
             w1,
             w1_bias,
             group_sizes,
-            transpose_rhs=True,
             mesh=mesh,
-            intermediate_size=intermediate_size,
         )
 
-
+        x = activation_fn(activation, x1, x2)
 
-    if use_ep:
-        x = expert_sharded_gmm(
-            x,
-            w2,
-            group_sizes,
-            transpose_rhs=True,
-            mesh=mesh,
-            num_experts=global_num_experts,
-            ep_size=ep_size,
-        )
-    else:
-        x = jax.lax.with_sharding_constraint(
-            x, NamedSharding(mesh, P(None, "model")))
         x = tensor_sharded_gmm_row_parallel(
             x,
             w2,
             w2_bias,
             group_sizes,
-            transpose_rhs=True,
             mesh=mesh,
         )
 
-
-
-
-
-
-
-
-    return x
+    def _finalize_output(x_local, topk_argsort_revert_indices_local,
+                         topk_weights_local):
+        x_local = x_local[topk_argsort_revert_indices_local].reshape(
+            -1, topk, hidden_size)
+        x_local = x_local * jnp.expand_dims(topk_weights_local, axis=-1)
+        x_local = x_local.sum(axis=-2)
+        return x_local
 
+    x = shard_map(
+        _finalize_output,
+        mesh=mesh,
+        in_specs=(P("data", None), P("data"), P("data", None)),
+        out_specs=(P("data", None)),
+    )(x, topk_argsort_revert_indices, topk_weights)
 
-
-    jax.jit,
-    static_argnames=(
-        "topk",
-        "global_num_experts",
-        "renormalize",
-        "reduce_results",
-        "mesh",
-        "use_ep",
-        "activation",
-    ),
-)
-def fused_moe_func_padded(
-    hidden_states: jax.Array,
-    w1: jax.Array,
-    w2: jax.Array,
-    w1_bias: jax.Array | None,
-    w2_bias: jax.Array | None,
-    gating_output: jax.Array,
-    topk: int,
-    global_num_experts: int,
-    renormalize: bool,
-    reduce_results: bool,
-    mesh: Mesh,
-    use_ep: bool,
-    activation: str,
-):
-    # TODO(fanhongmin@google.com): Once the jax runner pads the input, we no longer need this.
-    hidden_size = hidden_states.shape[-1]
-    num_tokens = hidden_states.size // hidden_size
-    if num_tokens * topk < 16:
-        assert 16 % (num_tokens *
-                     topk) == 0, f"Cannot pad to 16: {num_tokens=}, {topk=}"
-        n_repeats = 16 // (num_tokens * topk)
-
-        reps = (n_repeats, ) + (1, ) * (hidden_states.ndim - 1)
-        expanded_hidden_states = jnp.tile(hidden_states, reps)
-
-        reps = (n_repeats, ) + (1, ) * (gating_output.ndim - 1)
-        expanded_gating_output = jnp.tile(gating_output, reps)
-
-        expanded_x = fused_moe_func(
-            expanded_hidden_states,
-            w1,
-            w2,
-            w1_bias,
-            w2_bias,
-            expanded_gating_output,
-            topk,
-            global_num_experts,
-            renormalize,
-            reduce_results,
-            mesh,
-            use_ep,
-            activation,
-        )
-        x = expanded_x[:hidden_states.shape[0]]
-        return x
-    else:
-        return fused_moe_func(
-            hidden_states,
-            w1,
-            w2,
-            w1_bias,
-            w2_bias,
-            gating_output,
-            topk,
-            global_num_experts,
-            renormalize,
-            reduce_results,
-            mesh,
-            use_ep,
-            activation,
-        )
+    return x[:num_tokens, :hidden_size]
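Taken together, the new fused_moe_func routes each token to its top-k experts, runs the two grouped matmuls with the chosen activation in between, and sums the expert outputs weighted by the (optionally renormalized) gate probabilities. The sketch below is a plain-jnp reference of that routing/combine math, dense over experts and hard-coding the silu gate; it is a reading aid for the diff, not the package's TPU kernel, and the shapes and names are made up.

    import jax
    import jax.numpy as jnp

    def moe_reference(hidden_states, w1, w2, gating_output, topk, renormalize):
        num_tokens = hidden_states.shape[0]
        weights = jax.nn.softmax(gating_output.astype(jnp.float32), axis=-1)
        topk_weights, topk_idx = jax.lax.top_k(weights, k=topk)
        if renormalize:
            topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)

        # Dense per-expert compute; the kernel instead sorts tokens by expert id
        # and runs grouped matmuls (gmm) over contiguous token groups.
        def run_expert(e):
            x1, x2 = jnp.split(hidden_states @ w1[e].T, 2, axis=-1)
            return (jax.nn.silu(x1) * x2) @ w2[e].T

        expert_out = jax.vmap(run_expert)(jnp.arange(w1.shape[0]))        # [E, T, H]
        gathered = expert_out[topk_idx, jnp.arange(num_tokens)[:, None]]  # [T, k, H]
        out = (gathered * topk_weights[..., None]).sum(axis=1)
        return out.astype(hidden_states.dtype)

    tokens = jnp.ones((4, 8), jnp.float32)           # [num_tokens, hidden_size]
    w1 = jnp.ones((2, 12, 8), jnp.float32)           # [num_experts, intermediate * 2, hidden]
    w2 = jnp.ones((2, 8, 6), jnp.float32)            # [num_experts, hidden, intermediate]
    gates = jnp.tile(jnp.array([0.1, 0.9]), (4, 1))  # [num_tokens, num_experts]
    print(moe_reference(tokens, w1, w2, gates, topk=1, renormalize=True).shape)  # (4, 8)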
@@ -9,30 +9,52 @@ from jax.sharding import PartitionSpec as P
 from torchax.interop import torch_view
 from torchax.ops.mappings import t2j
 
-from tpu_inference
-
+from tpu_inference import envs
+from tpu_inference.kernels.quantized_matmul.kernel import (
+    quantized_matmul_kernel, xla_quantized_matmul)
 
 
 def sharded_quantized_matmul(x: jax.Array, w_q: jax.Array, w_s: jax.Array,
-                             mesh: Mesh, weight_sharding: P):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                             mesh: Mesh, weight_sharding: P) -> jax.Array:
+    """
+    Wrapper around the quantized matmul kernel.
+
+    Args:
+        x: Activation.
+        w_q: Weight quantized array. [n_output_features, n_input_features]
+        w_s: Weight quantization scale. [n_output_features]
+        mesh: Mesh to shard on.
+        weight_sharding: PartitionSpec for the weight tensor.
+
+    Returns:
+        Output of the quantized matmul.
+    """
+
+    # NOTE (jacobplatin/kyuyeunk) there have been numeric issues (concerning) NaNs
+    # with the kernel and thus we disable it for now.
+    if envs.ENABLE_QUANTIZED_MATMUL_KERNEL:
+        out_axis, in_axis = weight_sharding
+        x_sharding = P(None, in_axis)
+        scale_sharding = P(out_axis, )
+        out_sharding = P(None, out_axis)
+
+        x = jax.lax.with_sharding_constraint(x,
+                                             NamedSharding(mesh, x_sharding))
+
+        def wrapper(x, w_q, w_s):
+            output = quantized_matmul_kernel(x, w_q, w_s, x_q_dtype=w_q.dtype)
+            if in_axis:
+                output = jax.lax.psum(output, axis_name=in_axis)
+            return output
+
+        return shard_map(wrapper,
+                         mesh=mesh,
+                         in_specs=(x_sharding, weight_sharding,
+                                   scale_sharding),
+                         out_specs=(out_sharding),
+                         check_rep=False)(x, w_q, w_s)
+    else:
+        return xla_quantized_matmul(x, w_q, w_s)
 
 
 def reorder_concatenated_tensor_for_sharding(concatenated_tensor: jax.Array,
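For orientation, the shape contract in the new docstring (w_q: [n_output_features, n_input_features], w_s: [n_output_features]) describes a matmul against quantized weights with one scale per output feature. The toy reference below shows only that dequantize-and-multiply math; it is not the package's Pallas kernel nor its xla_quantized_matmul fallback (neither implementation appears in this diff), and the shapes are invented.

    import jax.numpy as jnp

    def reference_quantized_matmul(x, w_q, w_s):
        # Dequantize on the fly: int8 weights scaled per output feature.
        return (x @ w_q.T.astype(x.dtype)) * w_s.astype(x.dtype)[None, :]

    x = jnp.ones((2, 4), jnp.bfloat16)       # [tokens, n_input_features]
    w_q = jnp.ones((8, 4), jnp.int8)         # [n_output_features, n_input_features]
    w_s = jnp.full((8,), 0.5, jnp.bfloat16)  # [n_output_features]
    print(reference_quantized_matmul(x, w_q, w_s).shape)  # (2, 8)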
@@ -31,17 +31,17 @@ class JaxCommonLinearConfig:
         self.output_sizes = [layer.output_size]
         self.weight_sharding = P(None, None)
         self.fuse_matmuls = True
-        self.
+        self.enable_sp = vllm_config.compilation_config.pass_config.enable_sp
         self.input_sharding = None
         self.output_sharding = None
 
         if isinstance(layer, RowParallelLinear):
             self.weight_sharding = P(None, "model")
-            if self.
+            if self.enable_sp:
                 self.output_sharding = P("model", None)
         elif isinstance(layer, ColumnParallelLinear):
             self.weight_sharding = P("model", None)
-            if self.
+            if self.enable_sp:
                 self.input_sharding = P("model", None)
 
         if isinstance(layer, MergedColumnParallelLinear) or isinstance(
@@ -61,10 +61,15 @@ class JaxCommonLinearConfig:
             " bad performance.", type(layer))
 
         self.bias_sharding = P(self.weight_sharding[0])
-
+        if isinstance(self.weight_sharding[0], tuple):
+            self.n_shards = 1
+            for axis in self.weight_sharding[0]:
+                self.n_shards *= self.mesh.shape.get(axis, 1)
+        else:
+            self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
 
     def get_input_sharding(self, x: torchax.tensor.Tensor):
-        if self.
+        if self.enable_sp:
             token_num = x.shape[0]
             # NOTE(chengjiyao): make sure the sharded token_num is larger than TPU_SECOND_LAST_MINOR
             if token_num // self.mesh.shape["model"] >= TPU_SECOND_LAST_MINOR:
@@ -74,7 +79,7 @@ class JaxCommonLinearConfig:
         return self.input_sharding
 
     def get_output_sharding(self, x: torchax.tensor.Tensor):
-        if self.
+        if self.enable_sp:
             token_num = x.shape[0]
             # NOTE(chengjiyao): make sure the sharded token_num is larger than TPU_SECOND_LAST_MINOR
             if token_num // self.mesh.shape["model"] >= TPU_SECOND_LAST_MINOR:
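The n_shards logic added above handles PartitionSpec entries that name a tuple of mesh axes as well as a single axis: the shard count is the product of the named axes' sizes, defaulting to 1 for None or for axes missing from the mesh. A standalone sketch of that calculation, with a plain dict standing in for Mesh.shape (the 2x4 mesh is made up):

    mesh_shape = {"data": 2, "model": 4}  # stand-in for Mesh.shape

    def n_shards(spec_entry):
        if isinstance(spec_entry, tuple):
            count = 1
            for axis in spec_entry:
                count *= mesh_shape.get(axis, 1)
            return count
        return mesh_shape.get(spec_entry, 1)

    print(n_shards("model"))            # 4
    print(n_shards(("data", "model")))  # 8
    print(n_shards(None))               # 1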
@@ -20,7 +20,7 @@ from tpu_inference.layers.common.quant_methods import (COMPRESSED_TENSORS,
                                                         get_tpu_quant_method)
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import \
-
+    VllmCompressedTensorsMoEMethod
 from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import \
     VllmCompressedTensorsW8A8Fp8
 from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import \
@@ -113,8 +113,9 @@ class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
         if isinstance(layer, FusedMoE):
-
-
+            layer.moe_config = self.get_moe_config(layer)
+            return VllmCompressedTensorsMoEMethod.get_moe_method(
+                self, layer, layer_name=prefix)
         if isinstance(layer, Attention):
             return CompressedTensorsKVCacheMethod(self)
         return None