tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511180814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (56)
  1. tests/kernels/fused_moe_v1_test.py +34 -303
  2. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
  3. tests/lora/test_layers.py +6 -0
  4. tests/lora/utils.py +8 -0
  5. tests/test_envs.py +11 -32
  6. tests/test_utils.py +2 -1
  7. tpu_inference/__init__.py +3 -22
  8. tpu_inference/core/disagg_utils.py +8 -6
  9. tpu_inference/distributed/tpu_connector.py +4 -3
  10. tpu_inference/distributed/utils.py +2 -3
  11. tpu_inference/envs.py +8 -61
  12. tpu_inference/executors/ray_distributed_executor.py +2 -9
  13. tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
  14. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
  15. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +145 -266
  16. tpu_inference/layers/common/attention_interface.py +1 -7
  17. tpu_inference/layers/common/sharding.py +5 -5
  18. tpu_inference/layers/vllm/fused_moe.py +208 -170
  19. tpu_inference/layers/vllm/quantization/common.py +1 -6
  20. tpu_inference/layers/vllm/quantization/mxfp4.py +73 -138
  21. tpu_inference/layers/vllm/quantization/unquantized.py +64 -58
  22. tpu_inference/layers/vllm/sharding.py +2 -2
  23. tpu_inference/lora/torch_punica_tpu.py +2 -1
  24. tpu_inference/mock/__init__.py +0 -0
  25. tpu_inference/mock/vllm_config_utils.py +28 -0
  26. tpu_inference/mock/vllm_envs.py +1219 -0
  27. tpu_inference/mock/vllm_logger.py +212 -0
  28. tpu_inference/mock/vllm_logging_utils.py +15 -0
  29. tpu_inference/models/common/model_loader.py +10 -43
  30. tpu_inference/models/jax/llama3.py +1 -2
  31. tpu_inference/models/jax/llama_eagle3.py +5 -8
  32. tpu_inference/models/jax/phi3.py +376 -0
  33. tpu_inference/models/jax/qwen2.py +1 -2
  34. tpu_inference/models/jax/qwen2_5_vl.py +48 -163
  35. tpu_inference/models/jax/qwen3.py +1 -2
  36. tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
  37. tpu_inference/models/jax/utils/weight_utils.py +143 -198
  38. tpu_inference/models/vllm/vllm_model_wrapper.py +8 -14
  39. tpu_inference/platforms/tpu_platform.py +31 -37
  40. tpu_inference/runner/compilation_manager.py +58 -141
  41. tpu_inference/runner/kv_cache.py +1 -1
  42. tpu_inference/runner/kv_cache_manager.py +18 -17
  43. tpu_inference/runner/persistent_batch_manager.py +2 -40
  44. tpu_inference/runner/structured_decoding_manager.py +3 -2
  45. tpu_inference/runner/tpu_runner.py +147 -271
  46. tpu_inference/runner/utils.py +2 -2
  47. tpu_inference/spec_decode/jax/eagle3.py +21 -71
  48. tpu_inference/tpu_info.py +3 -4
  49. tpu_inference/utils.py +13 -36
  50. tpu_inference/worker/tpu_worker.py +25 -162
  51. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/METADATA +3 -4
  52. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/RECORD +55 -50
  53. tpu_inference/models/jax/llama_guard_4.py +0 -361
  54. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/WHEEL +0 -0
  55. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/licenses/LICENSE +0 -0
  56. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/top_level.txt +0 -0
@@ -2,16 +2,17 @@ import functools
 
 import jax
 from jax import numpy as jnp
-from jax import shard_map
 from jax.experimental.pallas.ops.tpu.megablox.gmm import gmm
-from jax.sharding import Mesh
-from jax.sharding import PartitionSpec as P
+from jax.experimental.shard_map import shard_map
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
 
 from tpu_inference.layers.vllm.linear_common import \
     slice_sharded_tensor_for_concatenation
 
+P = PartitionSpec
 
-def activation_fn(activation: str, x1: jax.Array, x2: jax.Array) -> jax.Array:
+
+def activation_fn(activation: str, x1, x2):
     match activation:
         case "silu":
             return jax.nn.silu(x1) * x2
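
For orientation, a minimal sketch (not from the package) of how the newly imported shard_map / PartitionSpec pair is used: the wrapped function runs once per shard over a named mesh axis, and collectives such as psum reduce across that axis. It assumes a 1-D "model" mesh built from whatever local JAX devices are available.

import numpy as np
import jax
from jax import numpy as jnp
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

# One mesh axis named "model" spanning all local devices.
mesh = Mesh(np.array(jax.devices()), ("model", ))

def _local_fn(x):
    # Each shard sees only its slice of the last axis; psum sums across shards.
    return jax.lax.psum(x.sum(axis=-1), axis_name="model")

x = jnp.ones((4, 8 * len(jax.devices())))
y = shard_map(_local_fn, mesh=mesh, in_specs=(P(None, "model"), ),
              out_specs=P())(x)   # y.shape == (4,)
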
@@ -22,10 +23,7 @@ def activation_fn(activation: str, x1: jax.Array, x2: jax.Array) -> jax.Array:
                 f"FusedMoE does not support {activation} activation")
 
 
-def _swigluoai(x1: jax.Array,
-               x2: jax.Array,
-               alpha=1.702,
-               limit=7.0) -> jax.Array:
+def _swigluoai(x1, x2, alpha=1.702, limit=7.0):
     x1 = jnp.clip(x1, a_max=limit)
     x2 = jnp.clip(x2, a_min=-limit, a_max=limit)
 
@@ -105,53 +103,40 @@ def tensor_sharded_gmm_merged_column_parallel(
     rhs: jax.Array,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
+    transpose_rhs: bool,
     mesh: Mesh,
-) -> tuple[jax.Array, jax.Array]:
-
-    def _gmm(lhs, rhs, group_sizes):
-        m, g, n, k = lhs.shape[0], *rhs.shape
-        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-        return gmm(
-            lhs,
-            rhs,
-            group_sizes,
-            preferred_element_type=lhs.dtype,
-            tiling=(tm, tk, tn),
-            transpose_rhs=True,
-            group_offset=jnp.array(0),
-        )
+    intermediate_size: int,
+) -> jax.Array:
+    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
+    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
+    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+
+    _gmm = functools.partial(
+        gmm,
+        preferred_element_type=lhs.dtype,
+        tiling=(tm, tk, tn),
+        transpose_rhs=transpose_rhs,
+        group_offset=jnp.array(0),
+    )
 
     gmm_result = shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(P("data", None), P(None, "model", None), P("data")),
-        out_specs=(P("data", "model")),
-        check_vma=False,
+        in_specs=(P(), P(None, "model", None), P()),
+        out_specs=(P(None, "model")),
+        check_rep=False,
     )(lhs, rhs, group_sizes)
 
     if rhs_bias is not None:
+        rhs_bis = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
+        gmm_result = (gmm_result + rhs_bis).astype(gmm_result.dtype)
 
-        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
-            rhs_bias = jnp.repeat(
-                rhs_bias_local,
-                group_sizes_global,
-                0,
-                total_repeat_length=gmm_result_local.shape[0])
-            return gmm_result_local + rhs_bias
-
-        gmm_result = shard_map(
-            _add_bias,
-            mesh=mesh,
-            in_specs=(P("data", "model"), P(None, "model"), P("data")),
-            out_specs=(P("data", "model")),
-        )(gmm_result, rhs_bias, group_sizes)
-        gmm_result = gmm_result.astype(lhs.dtype)
-
-    tp_size = mesh.shape["model"]
-    intermediate_size = gmm_result.shape[-1] // 2
+    n_shards = mesh.shape["model"]
     output_sizes = [intermediate_size, intermediate_size]
+
     return slice_sharded_tensor_for_concatenation(gmm_result, output_sizes,
-                                                  tp_size)
+                                                  n_shards)
 
 
 def tensor_sharded_gmm_row_parallel(
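
The new bias path above replaces a second shard_map with a plain jnp.repeat: each expert's bias row is repeated group_sizes[i] times so it lines up row-for-row with the grouped matmul output. A tiny worked illustration (values are made up):

import jax.numpy as jnp

rhs_bias = jnp.array([[10.0], [20.0], [30.0]])   # per-expert bias, [num_experts, n]
group_sizes = jnp.array([2, 1, 3])               # rows per expert, sums to m = 6
expanded = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=6)
# expanded[:, 0] -> [10., 10., 20., 30., 30., 30.], one bias row per output row
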
@@ -159,75 +144,74 @@ def tensor_sharded_gmm_row_parallel(
     rhs: jax.Array,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
+    transpose_rhs: bool,
     mesh: Mesh,
 ) -> jax.Array:
+    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
+    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
+    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+
+    _gmm = functools.partial(
+        gmm,
+        preferred_element_type=lhs.dtype,
+        tiling=(tm, tk, tn),
+        transpose_rhs=transpose_rhs,
+        group_offset=jnp.array(0),
+    )
 
     def _gmm_all_reduce(lhs, rhs, group_sizes):
-        m, g, n, k = lhs.shape[0], *rhs.shape
-        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-        out = gmm(
-            lhs,
-            rhs,
-            group_sizes,
-            preferred_element_type=lhs.dtype,
-            tiling=(tm, tk, tn),
-            transpose_rhs=True,
-            group_offset=jnp.array(0),
-        )
-        return jax.lax.psum(out, axis_name="model")
+        r = _gmm(lhs, rhs, group_sizes)
+        return jax.lax.psum(r, axis_name="model")
 
     gmm_result = shard_map(
         _gmm_all_reduce,
         mesh=mesh,
-        in_specs=(P("data", "model"), P(None, None, "model"), P("data")),
-        out_specs=(P("data")),
-        check_vma=False,
+        in_specs=(P(None, "model"), P(None, None, "model"), P()),
+        out_specs=(P()),
+        check_rep=False,
     )(lhs, rhs, group_sizes)
 
     if rhs_bias is not None:
+        rhs_bias = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
+        gmm_result = (gmm_result + rhs_bias).astype(gmm_result.dtype)
 
-        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
-            rhs_bias = jnp.repeat(
-                rhs_bias_local,
-                group_sizes_global,
-                0,
-                total_repeat_length=gmm_result_local.shape[0])
-            return gmm_result_local + rhs_bias
-
-        gmm_result = shard_map(
-            _add_bias,
-            mesh=mesh,
-            in_specs=(P("data"), P(), P("data")),
-            out_specs=(P("data")),
-        )(gmm_result, rhs_bias, group_sizes)
-
-    return gmm_result.astype(lhs.dtype)
+    return gmm_result
 
 
 def expert_sharded_gmm(
     lhs: jax.Array,
     rhs: jax.Array,
     group_sizes: jax.Array,
+    transpose_rhs: bool,
     mesh: Mesh,
+    num_experts: int,
+    ep_size: int,
 ) -> jax.Array:
-    ep_size = mesh.shape["model"]
+    # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
+    m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
+    n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
 
-    num_experts = rhs.shape[0]
     num_experts_per_shard = num_experts // ep_size
     group_offset = jnp.arange(0, num_experts, num_experts_per_shard)
+    group_offset = jax.lax.with_sharding_constraint(
+        group_offset, NamedSharding(mesh, P("model")))
 
     def _gmm(lhs, rhs, group_sizes, group_offset):
-        m, g, n, k = lhs.shape[0], *rhs.shape
-        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-
+        # Group offset for this shard. `group_offset` is sharded, and in this
+        # sharded function, it has only 1 element and `group_offset.shape` is
+        # (1,) but gmm kernel requires the group_offset to be a ()-shaped array,
+        # so we group_offset[0].
+        group_offset_of_shard = group_offset[0]
        gmm_res = gmm(
             lhs=lhs,
             rhs=rhs,
             group_sizes=group_sizes,
             preferred_element_type=lhs.dtype,
             tiling=(tm, tk, tn),
-            transpose_rhs=True,
-            group_offset=group_offset[0],
+            transpose_rhs=transpose_rhs,
+            group_offset=group_offset_of_shard,
         )
         return gmm_res
 
@@ -254,24 +238,30 @@ def expert_sharded_gmm(
         mesh=mesh,
         in_specs=(P(), P("model", None, None), P(), P("model")),
         out_specs=(P("model", None)),
-        check_vma=False,
+        check_rep=False,
     )(lhs, rhs, group_sizes, group_offset)
 
     # For i-th shard, it is responsible groups (AKA experts) from
     # i*num_experts_per_shard to (i+1)*num_experts_per_shard We sum them up to
     # get total rows in that shard, and that is the size for shard to send to
     # its peers. This is also the number of non-zero rows from the gmm results.
-    # In the working example, send_sizes would be [3, 2, 5, 4].
-
-    # group_sizes has shape of [num_tokens_per_shard * num_experts_per_shard].
-    # So reshaping to [num_tokens_per_shard, num_experts_per_shard] and applying
-    # sum(axis=1) will get desired send_sizes shaped [num_tokens_per_shard].
-    send_sizes = group_sizes.reshape(-1, num_experts_per_shard).sum(axis=1)
+    # In the working example, send_sizes would be [3, 2, 5, 4]
+    send_sizes = jnp.array([
+        group_sizes[i * num_experts_per_shard:(i + 1) *
+                    num_experts_per_shard].sum() for i in range(ep_size)
+    ])
     # In the working example, input_offsets would be [0, 3, 5, 10]
     input_offsets = jnp.concatenate((jnp.array([0]), send_sizes.cumsum()[:-1]))
     output_offsets = input_offsets
     recv_sizes = send_sizes
 
+    input_offsets = jax.lax.with_sharding_constraint(
+        input_offsets, NamedSharding(mesh, P("model")))
+    send_sizes = jax.lax.with_sharding_constraint(
+        send_sizes, NamedSharding(mesh, P("model")))
+    output_offsets = jax.lax.with_sharding_constraint(
+        output_offsets, NamedSharding(mesh, P("model")))
+
     def _ragged_all_to_all(operand, input_offsets, send_sizes, output_offsets,
                            recv_sizes):
         output = jnp.zeros_like(operand)
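
A small worked example of the expert-parallel bookkeeping above, with illustrative numbers chosen to reproduce the [3, 2, 5, 4] and [0, 3, 5, 10] values mentioned in the comments (num_experts = 8, ep_size = 4; the concrete group_sizes values are made up):

import jax.numpy as jnp

num_experts, ep_size = 8, 4
num_experts_per_shard = num_experts // ep_size            # 2
group_offset = jnp.arange(0, num_experts, num_experts_per_shard)
# group_offset -> [0, 2, 4, 6]; sharded over "model", shard i keeps one entry
# and passes group_offset[0] (a scalar) to the gmm kernel.

group_sizes = jnp.array([1, 2, 2, 0, 4, 1, 3, 1])         # rows routed to each expert
send_sizes = jnp.array([
    group_sizes[i * num_experts_per_shard:(i + 1) *
                num_experts_per_shard].sum() for i in range(ep_size)
])
# send_sizes -> [3, 2, 5, 4]
input_offsets = jnp.concatenate((jnp.array([0]), send_sizes.cumsum()[:-1]))
# input_offsets -> [0, 3, 5, 10]
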
@@ -326,20 +316,10 @@ def expert_sharded_gmm(
         mesh=mesh,
         in_specs=(P("model", None), P("model"), P("model"), P("model"), P()),
         out_specs=(P()),
-        check_vma=False,
+        check_rep=False,
     )(gmm_res, input_offsets, send_sizes, output_offsets, recv_sizes)
 
 
-@functools.partial(
-    jax.jit,
-    static_argnames=(
-        "topk",
-        "renormalize",
-        "mesh",
-        "use_ep",
-        "activation",
-    ),
-)
 def fused_moe_func(
     hidden_states: jax.Array,
     w1: jax.Array,
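
The jax.jit decorator removed here reappears below on the new fused_moe_func_padded wrapper. A minimal sketch (not from the package, hypothetical function name) of the functools.partial(jax.jit, static_argnames=...) pattern it uses: static arguments must be hashable and may drive Python-level control flow at trace time.

import functools
import jax
import jax.numpy as jnp

@functools.partial(jax.jit, static_argnames=("scale", ))
def scaled_sum(x, scale: float):
    # `scale` is static: each distinct value triggers one compilation and can
    # be used in ordinary Python branching.
    return x.sum() * scale if scale != 0 else jnp.zeros(())

y = scaled_sum(jnp.arange(4.0), scale=2.0)   # 12.0
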
@@ -348,45 +328,37 @@ def fused_moe_func(
     w2_bias: jax.Array | None,
     gating_output: jax.Array,
     topk: int,
+    global_num_experts: int,
     renormalize: bool,
+    reduce_results: bool,
     mesh: Mesh,
     use_ep: bool,
     activation: str,
-) -> jax.Array:
+):
     """
-    Route tokens in hidden_states into each experts based on routing
-    information in gating_out and performs moe with w1 and w2 weights.
-
     Args:
-        hidden_states: [num_tokens, hidden_size]
-        w1: first moe weights [num_experts, intermediate_size * 2, hidden_size]
-        w2: second moe weights [num_experts, hidden_size, intermediate_size]
-        w1_bias: optional bias of w1 [num_experts, intermediate_size * 2]
-        w2_bias: optional bias of w2 [num_experts, hidden_size]
-        gating_output: routing information of tokens [num_tokens, num_experts]
-        topk: number of experts to choose per token.
-        renormalize: normalize gating_output.
-        mesh: mesh to perform moe.
-        use_ep: use expert parallelism.
-        activation: activation function to perform on the output of w1.
-
-    Returns:
-        Output of moe operation [num_tokens, hidden_size]
+        hidden_states: [*, hidden_size]
+        w1: [num_experts, intermediate_size * 2, hidden_size]
+        w2: [num_experts, hidden_size, intermediate_size]
+        gating_output: [*, num_experts]
     """
+    # adapted from https://github.com/vllm-project/vllm/blob/29fa5cac1cd731026f59084d93a822921507573c/vllm/model_executor/layers/fused_moe/moe_pallas.py#L26
     if use_ep and (w1_bias is not None or w2_bias is not None):
         raise NotImplementedError(
             "Bias is not supported when using expert parallelism.")
-
-    num_tokens = hidden_states.shape[0]
-    global_num_experts, hidden_size, intermediate_size = w2.shape
+    orig_shape = hidden_states.shape
+    hidden_size = hidden_states.shape[-1]
+    num_tokens = hidden_states.size // hidden_size
+    assert global_num_experts == w1.shape[0]
+    ep_size = mesh.shape["model"]  # only used if use_ep is True.
+    intermediate_size = w2.shape[-1]
     dtype = hidden_states.dtype
-
     assert (num_tokens * topk) % 16 == 0, (
         "The kernel requires num_tokens * topk to be a multiple of "
         f"16 but got {num_tokens}*{topk}={num_tokens*topk}")
-    assert hidden_states.shape == (num_tokens, hidden_size)
-    assert gating_output.shape == (num_tokens, global_num_experts)
-    assert w1.shape == (global_num_experts, intermediate_size * 2, hidden_size)
+
+    hidden_states = hidden_states.reshape(num_tokens, hidden_size)
+    gating_output = gating_output.reshape(num_tokens, global_num_experts)
 
     topk_weights = jax.nn.softmax(gating_output.astype(jnp.float32), axis=-1)
     topk_weights, topk_indices = jax.lax.top_k(topk_weights, k=topk)
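
For context, a small routing illustration (not from the package) of the softmax / top_k / bincount steps around this hunk, with 2 tokens, 4 experts and topk = 2. Note the real kernel additionally requires num_tokens * topk to be a multiple of 16, which is what the fused_moe_func_padded wrapper added below handles.

import jax
import jax.numpy as jnp

gating_output = jnp.array([[0.1, 2.0, 0.3, 1.5],
                           [1.0, 0.2, 2.5, 0.4]])
topk_weights = jax.nn.softmax(gating_output.astype(jnp.float32), axis=-1)
topk_weights, topk_indices = jax.lax.top_k(topk_weights, k=2)
# topk_indices -> [[1, 3], [2, 0]]
topk_indices_flat = topk_indices.flatten()               # [1, 3, 2, 0]
topk_argsort_indices = jnp.argsort(topk_indices_flat)    # [3, 0, 2, 1]
group_sizes = jnp.bincount(topk_indices_flat, length=4)  # [1, 1, 1, 1]
# Rows of hidden_states are then gathered in expert order so each expert's
# tokens form one contiguous group for the grouped matmul (gmm).
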
@@ -394,76 +366,142 @@ def fused_moe_func(
     topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
     topk_weights = topk_weights.astype(dtype)
 
-    def _process_tokens_locally(hidden_states_local, topk_indices_local):
-        num_tokens_local = hidden_states_local.shape[0]
-        topk_indices_flat = topk_indices_local.flatten()
-        topk_argsort_indices = jnp.argsort(topk_indices_flat)
-        topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
-        token_indices = jnp.arange(num_tokens_local,
-                                   dtype=jnp.int32).repeat(topk)
-        token_indices_sorted = token_indices[topk_argsort_indices]
-        group_sizes_local = jnp.bincount(topk_indices_flat,
-                                         length=global_num_experts)
-
-        x = hidden_states_local[token_indices_sorted]
-        return x, group_sizes_local, topk_argsort_revert_indices
-
-    x, group_sizes, topk_argsort_revert_indices = shard_map(
-        _process_tokens_locally,
-        mesh=mesh,
-        in_specs=(P("data", None), P("data", None)),
-        out_specs=(P("data", None), P("data"), P("data")),
-    )(hidden_states, topk_indices)
+    topk_indices_flat = topk_indices.flatten()
+    topk_argsort_indices = jnp.argsort(topk_indices_flat)
+    topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
+    token_indices = jnp.arange(num_tokens, dtype=jnp.int32).repeat(topk)
+    token_indices_sorted = token_indices[topk_argsort_indices]
+    group_sizes = jnp.bincount(topk_indices_flat, length=global_num_experts)
+
+    x = hidden_states[token_indices_sorted]
 
     if use_ep:
         x = expert_sharded_gmm(
             x,
             w1,
             group_sizes,
+            transpose_rhs=True,
             mesh=mesh,
+            num_experts=global_num_experts,
+            ep_size=ep_size,
         )
-        x1, x2 = jnp.split(x, 2, -1)
-
-        x = activation_fn(activation, x1, x2)
-
-        x = expert_sharded_gmm(
-            x,
-            w2,
-            group_sizes,
-            mesh=mesh,
-        )
+        x1, x2 = x[..., :intermediate_size], x[..., intermediate_size:]
     else:
         x1, x2 = tensor_sharded_gmm_merged_column_parallel(
             x,
             w1,
             w1_bias,
             group_sizes,
+            transpose_rhs=True,
             mesh=mesh,
+            intermediate_size=intermediate_size,
         )
 
-        x = activation_fn(activation, x1, x2)
+    x = activation_fn(activation, x1, x2)
 
+    if use_ep:
+        x = expert_sharded_gmm(
+            x,
+            w2,
+            group_sizes,
+            transpose_rhs=True,
+            mesh=mesh,
+            num_experts=global_num_experts,
+            ep_size=ep_size,
+        )
+    else:
+        x = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P(None, "model")))
         x = tensor_sharded_gmm_row_parallel(
             x,
             w2,
             w2_bias,
             group_sizes,
+            transpose_rhs=True,
             mesh=mesh,
         )
 
-    def _finalize_output(x_local, topk_argsort_revert_indices_local,
-                         topk_weights_local):
-        x_local = x_local[topk_argsort_revert_indices_local].reshape(
-            -1, topk, hidden_size)
-        x_local = x_local * jnp.expand_dims(topk_weights_local, axis=-1)
-        x_local = x_local.sum(axis=-2)
-        return x_local
+    x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
+    x = x * jnp.expand_dims(topk_weights, axis=-1)
+    x = x.sum(axis=-2)
+    x = x.reshape(orig_shape)
 
-    x = shard_map(
-        _finalize_output,
-        mesh=mesh,
-        in_specs=(P("data", None), P("data"), P("data", None)),
-        out_specs=(P("data", None)),
-    )(x, topk_argsort_revert_indices, topk_weights)
+    if reduce_results:
+        x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P()))
+    return x
 
-    return x[:num_tokens, :hidden_size]
+
+@functools.partial(
+    jax.jit,
+    static_argnames=(
+        "topk",
+        "global_num_experts",
+        "renormalize",
+        "reduce_results",
+        "mesh",
+        "use_ep",
+        "activation",
+    ),
+)
+def fused_moe_func_padded(
+    hidden_states: jax.Array,
+    w1: jax.Array,
+    w2: jax.Array,
+    w1_bias: jax.Array | None,
+    w2_bias: jax.Array | None,
+    gating_output: jax.Array,
+    topk: int,
+    global_num_experts: int,
+    renormalize: bool,
+    reduce_results: bool,
+    mesh: Mesh,
+    use_ep: bool,
+    activation: str,
+):
+    # TODO(fanhongmin@google.com): Once the jax runner pads the input, we no longer need this.
+    hidden_size = hidden_states.shape[-1]
+    num_tokens = hidden_states.size // hidden_size
+    if num_tokens * topk < 16:
+        assert 16 % (num_tokens *
+                     topk) == 0, f"Cannot pad to 16: {num_tokens=}, {topk=}"
+        n_repeats = 16 // (num_tokens * topk)
+
+        reps = (n_repeats, ) + (1, ) * (hidden_states.ndim - 1)
+        expanded_hidden_states = jnp.tile(hidden_states, reps)
+
+        reps = (n_repeats, ) + (1, ) * (gating_output.ndim - 1)
+        expanded_gating_output = jnp.tile(gating_output, reps)
+
+        expanded_x = fused_moe_func(
+            expanded_hidden_states,
+            w1,
+            w2,
+            w1_bias,
+            w2_bias,
+            expanded_gating_output,
+            topk,
+            global_num_experts,
+            renormalize,
+            reduce_results,
+            mesh,
+            use_ep,
+            activation,
+        )
+        x = expanded_x[:hidden_states.shape[0]]
+        return x
+    else:
+        return fused_moe_func(
+            hidden_states,
+            w1,
+            w2,
+            w1_bias,
+            w2_bias,
+            gating_output,
+            topk,
+            global_num_experts,
+            renormalize,
+            reduce_results,
+            mesh,
+            use_ep,
+            activation,
+        )
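
The padding arithmetic of fused_moe_func_padded in isolation (illustrative shapes only, not from the package): with num_tokens = 2 and topk = 4, num_tokens * topk = 8 < 16, so the inputs are tiled twice along the token axis and only the original rows are kept afterwards.

import jax.numpy as jnp

num_tokens, topk, hidden_size = 2, 4, 8
hidden_states = jnp.ones((num_tokens, hidden_size))
n_repeats = 16 // (num_tokens * topk)                     # 2
reps = (n_repeats, ) + (1, ) * (hidden_states.ndim - 1)   # (2, 1)
expanded = jnp.tile(hidden_states, reps)                  # shape (4, 8)
# fused_moe_func would run on the tiled batch; only the first num_tokens rows
# (expanded_x[:hidden_states.shape[0]]) are returned to the caller.
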
@@ -61,12 +61,7 @@ class JaxCommonLinearConfig:
                            " bad performance.", type(layer))
 
         self.bias_sharding = P(self.weight_sharding[0])
-        if isinstance(self.weight_sharding[0], tuple):
-            self.n_shards = 1
-            for axis in self.weight_sharding[0]:
-                self.n_shards *= self.mesh.shape.get(axis, 1)
-        else:
-            self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
+        self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
 
     def get_input_sharding(self, x: torchax.tensor.Tensor):
         if self.enable_sequence_parallelism:
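
The simplified n_shards lookup above relies on Mesh.shape behaving like a mapping from axis names to sizes, with a None (unsharded) axis falling back to 1. A minimal sketch (not from the package), independent of the local device count:

import numpy as np
import jax
from jax.sharding import Mesh

devices = np.array(jax.devices())
mesh = Mesh(devices.reshape(1, -1), ("data", "model"))
n_shards = mesh.shape.get("model", 1)   # size of the "model" axis
n_unsharded = mesh.shape.get(None, 1)   # 1 when the weight axis is unsharded
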