tpu-inference 0.11.1.dev202511150811__py3-none-any.whl → 0.11.1.dev202511270815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference has been flagged as potentially problematic.

Files changed (49)
  1. tests/kernels/fused_moe_v1_test.py +303 -34
  2. tests/lora/test_layers.py +0 -6
  3. tests/lora/utils.py +0 -8
  4. tpu_inference/__init__.py +22 -3
  5. tpu_inference/core/disagg_utils.py +6 -8
  6. tpu_inference/distributed/tpu_connector.py +2 -3
  7. tpu_inference/distributed/utils.py +3 -2
  8. tpu_inference/envs.py +1 -1
  9. tpu_inference/executors/ray_distributed_executor.py +27 -11
  10. tpu_inference/kernels/fused_moe/v1/kernel.py +641 -110
  11. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
  12. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +141 -107
  13. tpu_inference/layers/common/attention_interface.py +7 -1
  14. tpu_inference/layers/common/sharding.py +2 -1
  15. tpu_inference/layers/vllm/fused_moe.py +74 -25
  16. tpu_inference/layers/vllm/quantization/common.py +6 -1
  17. tpu_inference/layers/vllm/quantization/mxfp4.py +135 -61
  18. tpu_inference/layers/vllm/quantization/unquantized.py +107 -113
  19. tpu_inference/layers/vllm/sharding.py +2 -2
  20. tpu_inference/lora/torch_punica_tpu.py +1 -2
  21. tpu_inference/models/common/model_loader.py +43 -11
  22. tpu_inference/models/jax/llama3.py +2 -1
  23. tpu_inference/models/jax/llama_eagle3.py +8 -5
  24. tpu_inference/models/jax/llama_guard_4.py +361 -0
  25. tpu_inference/models/jax/qwen2.py +2 -1
  26. tpu_inference/models/jax/qwen2_5_vl.py +163 -48
  27. tpu_inference/models/jax/qwen3.py +2 -1
  28. tpu_inference/models/jax/utils/weight_utils.py +198 -143
  29. tpu_inference/models/vllm/vllm_model_wrapper.py +13 -5
  30. tpu_inference/platforms/tpu_platform.py +15 -2
  31. tpu_inference/runner/compilation_manager.py +58 -33
  32. tpu_inference/runner/kv_cache_manager.py +9 -3
  33. tpu_inference/runner/structured_decoding_manager.py +2 -3
  34. tpu_inference/runner/tpu_runner.py +203 -102
  35. tpu_inference/spec_decode/jax/eagle3.py +19 -2
  36. tpu_inference/tpu_info.py +4 -3
  37. tpu_inference/utils.py +5 -4
  38. tpu_inference/worker/tpu_worker.py +160 -23
  39. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202511270815.dist-info}/METADATA +3 -2
  40. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202511270815.dist-info}/RECORD +43 -48
  41. tpu_inference/mock/__init__.py +0 -0
  42. tpu_inference/mock/vllm_config_utils.py +0 -28
  43. tpu_inference/mock/vllm_envs.py +0 -1219
  44. tpu_inference/mock/vllm_logger.py +0 -212
  45. tpu_inference/mock/vllm_logging_utils.py +0 -15
  46. tpu_inference/models/jax/phi3.py +0 -376
  47. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202511270815.dist-info}/WHEEL +0 -0
  48. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202511270815.dist-info}/licenses/LICENSE +0 -0
  49. {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202511270815.dist-info}/top_level.txt +0 -0
@@ -110,7 +110,8 @@ def tensor_sharded_gmm_merged_column_parallel(
     # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
     m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
     n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m // mesh.shape["data"], k, n,
+                                                 g)
 
     _gmm = functools.partial(
         gmm,
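
Note: the hunks in this block appear to come from tpu_inference/layers/vllm/fused_moe.py, judging by the function names and the +74/-25 count in the file list. The tiling change above sizes the GMM tiles for the per-shard row count, m // mesh.shape["data"], because lhs is now sharded over the "data" mesh axis (see the in_specs change in the next hunk); under shard_map the grouped-matmul kernel only ever sees its local block of rows. A toy sketch of the arithmetic, with made-up axis sizes and a stand-in for _get_tiling_size_for_gmm_kernel (whose real heuristic is internal to the package):

    # Hypothetical mesh axis sizes; the real mesh comes from the TPU runner.
    mesh_shape = {"data": 2, "model": 4}        # stand-in for mesh.shape
    m, k, n, num_groups = 8192, 2048, 5632, 16  # global GMM problem size

    m_local = m // mesh_shape["data"]           # rows each data-parallel shard sees

    def toy_tiling(m, k, n, g):
        # Stand-in for _get_tiling_size_for_gmm_kernel: clamp tile sizes to the
        # problem size. The package's real heuristic is more involved.
        return min(m, 512), min(k, 2048), min(n, 2048)

    print(toy_tiling(m, k, n, num_groups))        # old: tiles sized for global m
    print(toy_tiling(m_local, k, n, num_groups))  # new: tiles sized per shard
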
@@ -123,14 +124,26 @@ def tensor_sharded_gmm_merged_column_parallel(
     gmm_result = shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(P(), P(None, "model", None), P()),
-        out_specs=(P(None, "model")),
+        in_specs=(P("data", None), P(None, "model", None), P("data")),
+        out_specs=(P("data", "model")),
         check_rep=False,
     )(lhs, rhs, group_sizes)
 
     if rhs_bias is not None:
-        rhs_bis = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
-        gmm_result = (gmm_result + rhs_bis).astype(gmm_result.dtype)
+
+        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
+            rhs_bis = jnp.repeat(rhs_bias_local,
+                                 group_sizes_global,
+                                 0,
+                                 total_repeat_length=m // mesh.shape["data"])
+            return (gmm_result_local + rhs_bis).astype(gmm_result_local.dtype)
+
+        gmm_result = shard_map(
+            _add_bias,
+            mesh=mesh,
+            in_specs=(P("data", "model"), P(None, "model"), P("data")),
+            out_specs=(P("data", "model")),
+        )(gmm_result, rhs_bias, group_sizes)
 
     n_shards = mesh.shape["model"]
     output_sizes = [intermediate_size, intermediate_size]
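
The bias addition is now wrapped in its own shard_map so each "data" shard repeats the per-expert bias only for the rows it owns (total_repeat_length is the per-shard m), while the bias stays sharded over "model" along its columns. Below is a self-contained sketch of the same pattern with invented shapes and group sizes (add_bias here is not the package's _add_bias); the XLA flag fakes four CPU devices so it runs without TPUs, and must be set before JAX initializes:

    import os
    os.environ["XLA_FLAGS"] = "--xla_force_host_platform_device_count=4"

    import jax
    import jax.numpy as jnp
    import numpy as np
    from jax.experimental.shard_map import shard_map  # jax.shard_map on newer JAX
    from jax.sharding import Mesh, PartitionSpec as P

    mesh = Mesh(np.array(jax.devices()).reshape(2, 2), axis_names=("data", "model"))

    num_experts, m, n = 4, 8, 8                      # toy sizes
    gmm_out = jnp.zeros((m, n))                      # stands in for gmm_result
    bias = jnp.arange(num_experts * n, dtype=jnp.float32).reshape(num_experts, n)
    group_sizes = jnp.ones((m,), dtype=jnp.int32)    # per-shard expert row counts (toy)

    def add_bias(out_local, bias_local, group_sizes_local):
        # Repeat each expert's bias row by its token count, capped at the number
        # of rows this data shard owns (m // size of the "data" axis).
        rep = jnp.repeat(bias_local, group_sizes_local, axis=0,
                         total_repeat_length=m // mesh.shape["data"])
        return (out_local + rep).astype(out_local.dtype)

    out = shard_map(
        add_bias,
        mesh=mesh,
        in_specs=(P("data", "model"), P(None, "model"), P("data")),
        out_specs=P("data", "model"),
        check_rep=False,
    )(gmm_out, bias, group_sizes)
    print(out.shape)  # (8, 8), still sharded over ("data", "model")

The row-parallel variant further down applies the same trick, except that after the all-reduce the GMM output is replicated across "model", so the bias there is passed in fully replicated (in_specs P() for rhs_bias).
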
@@ -150,7 +163,8 @@ def tensor_sharded_gmm_row_parallel(
     # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
     m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
     n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m // mesh.shape["data"], k, n,
+                                                 g)
 
     _gmm = functools.partial(
         gmm,
@@ -167,14 +181,25 @@ def tensor_sharded_gmm_row_parallel(
     gmm_result = shard_map(
         _gmm_all_reduce,
         mesh=mesh,
-        in_specs=(P(None, "model"), P(None, None, "model"), P()),
-        out_specs=(P()),
+        in_specs=(P("data", "model"), P(None, None, "model"), P("data")),
+        out_specs=(P("data")),
         check_rep=False,
     )(lhs, rhs, group_sizes)
-
     if rhs_bias is not None:
-        rhs_bias = jnp.repeat(rhs_bias, group_sizes, 0, total_repeat_length=m)
-        gmm_result = (gmm_result + rhs_bias).astype(gmm_result.dtype)
+
+        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
+            rhs_bis = jnp.repeat(rhs_bias_local,
+                                 group_sizes_global,
+                                 0,
+                                 total_repeat_length=m // mesh.shape["data"])
+            return (gmm_result_local + rhs_bis).astype(gmm_result_local.dtype)
+
+        gmm_result = shard_map(
+            _add_bias,
+            mesh=mesh,
+            in_specs=(P("data"), P(), P("data")),
+            out_specs=(P("data")),
+        )(gmm_result, rhs_bias, group_sizes)
 
     return gmm_result
 
@@ -366,15 +391,27 @@ def fused_moe_func(
     topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
     topk_weights = topk_weights.astype(dtype)
 
-    topk_indices_flat = topk_indices.flatten()
-    topk_argsort_indices = jnp.argsort(topk_indices_flat)
-    topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
-    token_indices = jnp.arange(num_tokens, dtype=jnp.int32).repeat(topk)
-    token_indices_sorted = token_indices[topk_argsort_indices]
-    group_sizes = jnp.bincount(topk_indices_flat, length=global_num_experts)
-
-    x = hidden_states[token_indices_sorted]
-
+    def _process_tokens_locally(hidden_states_local, topk_indices_local):
+        num_tokens_local = hidden_states_local.shape[0]
+        topk_indices_flat = topk_indices_local.flatten()
+        topk_argsort_indices = jnp.argsort(topk_indices_flat)
+        topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
+        token_indices = jnp.arange(num_tokens_local,
+                                   dtype=jnp.int32).repeat(topk)
+        token_indices_sorted = token_indices[topk_argsort_indices]
+        group_sizes_local = jnp.bincount(topk_indices_flat,
+                                         length=global_num_experts)
+
+        x = hidden_states_local[token_indices_sorted]
+        return x, group_sizes_local, topk_argsort_revert_indices
+
+    x, group_sizes, topk_argsort_revert_indices = shard_map(
+        _process_tokens_locally,
+        mesh=mesh,
+        in_specs=(P("data", None), P("data", None)),
+        out_specs=(P("data", None), P("data"), P("data")),
+        check_rep=False,
+    )(hidden_states, topk_indices)
     if use_ep:
         x = expert_sharded_gmm(
             x,
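
The routing bookkeeping (duplicating each token topk times, sorting the copies by expert id so each expert's rows are contiguous for the grouped matmul, counting rows per expert, and keeping the inverse permutation) now runs per data shard inside shard_map, over num_tokens_local rows instead of the global num_tokens. The per-shard body is ordinary JAX, so it can be illustrated without a mesh; the numbers here are invented:

    import jax.numpy as jnp

    topk = 2
    global_num_experts = 4
    hidden = jnp.arange(3 * 5, dtype=jnp.float32).reshape(3, 5)  # 3 tokens, dim 5
    topk_indices = jnp.array([[0, 2], [1, 2], [3, 0]])           # experts per token

    flat = topk_indices.flatten()                 # expert id per (token, slot) pair
    order = jnp.argsort(flat)                     # groups rows by expert
    revert = jnp.argsort(order)                   # undoes the grouping later
    token_ids = jnp.arange(3, dtype=jnp.int32).repeat(topk)
    x = hidden[token_ids[order]]                  # expert-grouped activations
    group_sizes = jnp.bincount(flat, length=global_num_experts)

    print(group_sizes)       # [2 1 2 1] rows per expert
    print(x.shape)           # (6, 5)
    print(token_ids[order])  # [0 2 1 0 1 2]: source token of each grouped row
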
@@ -411,7 +448,7 @@ def fused_moe_func(
         )
     else:
         x = jax.lax.with_sharding_constraint(
-            x, NamedSharding(mesh, P(None, "model")))
+            x, NamedSharding(mesh, P("data", "model")))
         x = tensor_sharded_gmm_row_parallel(
             x,
             w2,
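
In the non-EP path the activations are now constrained to P("data", "model") rather than P(None, "model"), so the row-parallel GMM that follows keeps its rows split across "data" while the contraction over the "model"-sharded dimension is finished with an all-reduce (the _gmm_all_reduce wrapper seen earlier). A minimal sketch of that row-parallel pattern, with a plain matmul standing in for the grouped matmul and invented shapes; as above, the XLA flag fakes CPU devices:

    import os
    os.environ["XLA_FLAGS"] = "--xla_force_host_platform_device_count=4"

    import jax
    import jax.numpy as jnp
    import numpy as np
    from jax.experimental.shard_map import shard_map
    from jax.sharding import Mesh, PartitionSpec as P

    mesh = Mesh(np.array(jax.devices()).reshape(2, 2), axis_names=("data", "model"))
    x = jnp.ones((8, 4))  # rows over "data", contracting dim over "model"
    w = jnp.ones((4, 6))  # contracting dim over "model"

    def matmul_all_reduce(x_local, w_local):
        # Each shard multiplies its slice of k, then the partial products are
        # summed across "model", which is what _gmm_all_reduce does for the GMM.
        return jax.lax.psum(x_local @ w_local, axis_name="model")

    y = shard_map(
        matmul_all_reduce,
        mesh=mesh,
        in_specs=(P("data", "model"), P("model", None)),
        out_specs=P("data", None),
        check_rep=False,
    )(x, w)
    print(y.shape)  # (8, 6): replicated over "model", still sharded over "data"
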
@@ -421,13 +458,25 @@ def fused_moe_func(
             mesh=mesh,
         )
 
-    x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
-    x = x * jnp.expand_dims(topk_weights, axis=-1)
-    x = x.sum(axis=-2)
+    def _finalize_output(x_local, topk_argsort_revert_indices_local,
+                         topk_weights_local):
+        x_local = x_local[topk_argsort_revert_indices_local].reshape(
+            -1, topk, hidden_size)
+        x_local = x_local * jnp.expand_dims(topk_weights_local, axis=-1)
+        x_local = x_local.sum(axis=-2)
+        return x_local
+
+    x = shard_map(
+        _finalize_output,
+        mesh=mesh,
+        in_specs=(P("data", None), P("data"), P("data", None)),
+        out_specs=(P("data", None)),
+        check_rep=False,
+    )(x, topk_argsort_revert_indices, topk_weights)
     x = x.reshape(orig_shape)
 
     if reduce_results:
-        x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P()))
+        x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P("data")))
     return x
 
 
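
_finalize_output does the inverse of the routing step, again per data shard: undo the expert grouping, weight each of the topk expert outputs by its routing probability, and sum them per token. On one shard this is just the following reshape and weighted sum, shown with toy values (the permutation and weights are made up):

    import jax.numpy as jnp

    topk, hidden_size = 2, 5
    expert_out = jnp.ones((6, hidden_size))   # expert-grouped rows from the GMMs
    revert = jnp.array([0, 3, 2, 4, 5, 1])    # inverse permutation (toy)
    topk_weights = jnp.array([[0.75, 0.25],
                              [0.5, 0.5],
                              [0.9, 0.1]])    # (num_tokens, topk)

    x = expert_out[revert].reshape(-1, topk, hidden_size)  # back to token order
    x = x * jnp.expand_dims(topk_weights, axis=-1)         # weight each expert copy
    x = x.sum(axis=-2)                                     # mix the topk experts
    print(x.shape)  # (3, 5): one combined row per token

Because every intermediate now carries a "data" dimension, the final reduce_results constraint also changes from fully replicated (P()) to data-sharded (P("data")).
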
@@ -61,7 +61,12 @@ class JaxCommonLinearConfig:
                 " bad performance.", type(layer))
 
         self.bias_sharding = P(self.weight_sharding[0])
-        self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
+        if isinstance(self.weight_sharding[0], tuple):
+            self.n_shards = 1
+            for axis in self.weight_sharding[0]:
+                self.n_shards *= self.mesh.shape.get(axis, 1)
+        else:
+            self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
 
     def get_input_sharding(self, x: torchax.tensor.Tensor):
         if self.enable_sequence_parallelism:
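
This hunk appears to be from tpu_inference/layers/vllm/quantization/common.py (the +6/-1 entry in the file list). It generalizes the shard count so that a weight dimension sharded over a tuple of mesh axes, e.g. P(("data", "model"), None), reports the product of those axis sizes instead of defaulting to 1. A toy re-implementation with a dict standing in for self.mesh.shape:

    mesh_shape = {"data": 2, "model": 4}  # stand-in for self.mesh.shape

    def num_shards(spec_entry):
        # spec_entry is one element of a PartitionSpec: an axis name, a tuple of
        # axis names, or None for an unsharded dimension.
        if isinstance(spec_entry, tuple):
            n = 1
            for axis in spec_entry:
                n *= mesh_shape.get(axis, 1)
            return n
        return mesh_shape.get(spec_entry, 1)

    print(num_shards("model"))            # 4
    print(num_shards(("data", "model")))  # 8
    print(num_shards(None))               # 1
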
@@ -24,6 +24,8 @@ from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
 from vllm.model_executor.layers.quantization.utils.quant_utils import \
     is_layer_skipped
 
+from tpu_inference import envs
+from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (MXFP4,
                                                        get_tpu_quant_method)
 from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
@@ -103,13 +105,30 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
 
 class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
-    def __init__(self, moe: FusedMoEConfig, mesh: Mesh):
+    def __init__(self,
+                 moe: FusedMoEConfig,
+                 mesh: Mesh,
+                 ep_axis_name: str = 'model'):
         FusedMoEMethodBase.__init__(self, moe)
 
         # We piggyback on triton implementation as it applies minimal hardware
         # specific post processing to the weights.
         self.mxfp4_backend = Mxfp4Backend.TRITON
+
         self.mesh = mesh
+        self.use_kernel = envs.USE_MOE_EP_KERNEL
+        self.ep_axis_name = ep_axis_name
+        # TODO: Use autotune table once we have it.
+        self.block_size = {
+            "bt": 64,
+            "bf": 1024,
+            "bd1": 1536,
+            "bd2": 1536,
+            "btc": 64,
+            "bfc": 1024,
+            "bd1c": 1536,
+            "bd2c": 1536,
+        }
 
     def get_fused_moe_quant_config(
             self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
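
The remaining hunks appear to be from tpu_inference/layers/vllm/quantization/mxfp4.py (the +135/-61 entry in the file list). The constructor now reads the USE_MOE_EP_KERNEL toggle from tpu_inference.envs, records which mesh axis expert parallelism runs over (default "model"), and hard-codes block sizes until an autotune table exists; the dict is later splatted directly into the kernel call as keyword arguments (**self.block_size in the apply hunk further down). A trivial sketch of that plumbing with a stand-in function, since only the keyword names are known here, not the real fused_ep_moe signature:

    block_size = {
        "bt": 64, "bf": 1024, "bd1": 1536, "bd2": 1536,
        "btc": 64, "bfc": 1024, "bd1c": 1536, "bd2c": 1536,
    }

    def toy_kernel(tokens, *, bt, bf, bd1, bd2, btc, bfc, bd1c, bd2c):
        # A real kernel would use these as tile sizes; here we just echo a few.
        return {"token_block": bt, "feature_block": bf, "ffn_blocks": (bd1, bd2)}

    print(toy_kernel("tokens", **block_size))
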
@@ -122,6 +141,7 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         assert isinstance(layer, FusedMoE)
+        assert layer.moe_config.has_bias, "mxfp4 quantization alwyas use bias."
 
         w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
         w13_weight_scale = e8m0_to_fp32(
@@ -157,57 +177,95 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
         w3_bias = w13_bias[:, 1::2]
         w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-        # TODO(kyuyeunk): Add weight processing logic for the new kernel.
-        if layer.use_ep:
+        if self.use_kernel and layer.use_ep:
+            # Kernel expects:
+            # w13: (num_experts, 2, hidden_size, intermediate_size)
+            # w2: (num_experts, intermediate_size, hidden_size)
+            # Current format:
+            # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+            # w2_weight: (num_experts, hidden_size, intermediate_size)
+            num_experts = w13_weight.shape[0]
+            intermediate_size = w13_weight.shape[1] // 2
+            hidden_size = w13_weight.shape[2]
+
+            # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
+            w13_reshaped = w13_weight.reshape(num_experts, 2,
+                                              intermediate_size, hidden_size)
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
+
+            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+            w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
+
+            # Apply EP sharding
             w13_weight = jax.device_put(
-                w13_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P("model", None, None))))
+                w13_weight_transposed,
+                Format(Layout((0, 1, 2, 3)),
+                       NamedSharding(self.mesh, P("model", None, None, None))))
             w2_weight = jax.device_put(
-                w2_weight,
+                w2_weight_transposed,
                 Format(Layout((0, 1, 2)),
                        NamedSharding(self.mesh, P("model", None, None))))
 
-            w13_bias = jax.device_put(
-                w13_bias,
-                Format(Layout((0, 1)),
-                       NamedSharding(self.mesh, P("model", None))))
-            w2_bias = jax.device_put(
-                w2_bias,
-                Format(Layout((0, 1)),
-                       NamedSharding(self.mesh, P("model", None))))
+            if self.moe.has_bias:
+                w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
+
+                # Apply EP sharding
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
 
         else:
-            intermediate_size = w13_weight.shape[1] // 2
-            assert intermediate_size == w2_weight.shape[-1]
-            output_sizes = [intermediate_size, intermediate_size]
-            n_shards = self.mesh.shape["model"]
-            assert intermediate_size % n_shards == 0
-            w13_weight = reorder_concatenated_tensor_for_sharding(w13_weight,
-                                                                  output_sizes,
-                                                                  n_shards,
-                                                                  dim=1)
-            w13_weight = jax.device_put(
-                w13_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P(None, "model", None))))
-            w2_weight = jax.device_put(
-                w2_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P(None, None, "model"))))
-
-            w13_bias = reorder_concatenated_tensor_for_sharding(w13_bias,
-                                                                output_sizes,
-                                                                n_shards,
-                                                                dim=1)
-            w13_bias = jax.device_put(
-                w13_bias,
-                Format(Layout((0, 1)),
-                       NamedSharding(self.mesh, P(None, "model"))))
-            w2_bias = jax.device_put(
-                w2_bias,
-                Format(Layout((0, 1)), NamedSharding(self.mesh, P(None,
-                                                                  None))))
+            if layer.use_ep:
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+
+            else:
+                intermediate_size = w13_weight.shape[1] // 2
+                assert intermediate_size == w2_weight.shape[-1]
+                output_sizes = [intermediate_size, intermediate_size]
+                n_shards = self.mesh.shape["model"]
+                assert intermediate_size % n_shards == 0
+                w13_weight = reorder_concatenated_tensor_for_sharding(
+                    w13_weight, output_sizes, n_shards, dim=1)
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, "model", None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, None, "model"))))
+
+                w13_bias = reorder_concatenated_tensor_for_sharding(
+                    w13_bias, output_sizes, n_shards, dim=1)
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P(None, "model"))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P(None, None))))
 
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
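
The kernel branch reorders the checkpoint layout before sharding: w13 (already split into stacked w1/w3 by the slicing above) goes from (num_experts, 2*intermediate, hidden) to (num_experts, 2, hidden, intermediate), w2 is transposed to (num_experts, intermediate, hidden), and the bias becomes (num_experts, 2, intermediate). All three then get a leading ("model", ...) sharding so each expert-parallel rank holds a contiguous slice of experts. The same reshape/transpose on toy shapes:

    import jax.numpy as jnp

    num_experts, intermediate, hidden = 4, 6, 8  # toy sizes
    w13 = jnp.zeros((num_experts, 2 * intermediate, hidden))
    w2 = jnp.zeros((num_experts, hidden, intermediate))
    w13_bias = jnp.zeros((num_experts, 2 * intermediate))

    # (E, 2I, H) -> (E, 2, I, H) -> (E, 2, H, I), as in the kernel branch above.
    w13_kernel = jnp.transpose(
        w13.reshape(num_experts, 2, intermediate, hidden), (0, 1, 3, 2))
    w2_kernel = jnp.transpose(w2, (0, 2, 1))     # (E, H, I) -> (E, I, H)
    w13_bias_kernel = w13_bias.reshape(num_experts, 2, intermediate)

    print(w13_kernel.shape, w2_kernel.shape, w13_bias_kernel.shape)
    # (4, 2, 8, 6) (4, 6, 8) (4, 2, 6)
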
@@ -246,21 +304,37 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-        # Use the original implementation
-        output = fused_moe_func_padded(
-            jax_view(x),
-            jax_view(layer.w13_weight),
-            jax_view(layer.w2_weight),
-            jax_view(layer.w13_bias) if self.moe.has_bias else None,
-            jax_view(layer.w2_bias) if self.moe.has_bias else None,
-            jax_view(router_logits),
-            topk=top_k,
-            global_num_experts=global_num_experts,
-            renormalize=renormalize,
-            reduce_results=layer.reduce_results,
-            mesh=self.mesh,
-            use_ep=layer.use_ep,
-            activation=activation,
-        )
+        if self.use_kernel and layer.use_ep:
+            output = fused_ep_moe(
+                mesh=self.mesh,
+                tokens=jax_view(x),
+                w1=jax_view(layer.w13_weight),
+                w2=jax_view(layer.w2_weight),
+                b1=jax_view(layer.w13_bias),
+                b2=jax_view(layer.w2_bias),
+                gating_output=jax_view(router_logits),
+                top_k=top_k,
+                ep_axis_name=self.ep_axis_name,
+                renormalize_topk_logits=renormalize,
+                act_fn=activation,
+                **self.block_size,
+            )
+        else:
+            # Use the original implementation
+            output = fused_moe_func_padded(
+                jax_view(x),
+                jax_view(layer.w13_weight),
+                jax_view(layer.w2_weight),
+                jax_view(layer.w13_bias),
+                jax_view(layer.w2_bias),
+                jax_view(router_logits),
+                topk=top_k,
+                global_num_experts=global_num_experts,
+                renormalize=renormalize,
+                reduce_results=layer.reduce_results,
+                mesh=self.mesh,
+                use_ep=layer.use_ep,
+                activation=activation,
+            )
 
         return torch_view(output)
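
In apply(), the flag-gated kernel path calls fused_ep_moe with the biases passed unconditionally, consistent with the new has_bias assertion, and the fallback path likewise drops the old "if self.moe.has_bias else None" guards around fused_moe_func_padded. Both paths forward the same renormalize flag; the sketch below shows what that flag means for the routing weights, using toy logits rather than the real router output (the exact ordering of softmax and top-k inside the kernel is not shown here):

    import jax
    import jax.numpy as jnp

    logits = jnp.array([[2.0, 0.5, 0.1, -1.0]])         # one token, four experts
    probs = jax.nn.softmax(logits, axis=-1)
    topk_weights, topk_idx = jax.lax.top_k(probs, k=2)   # keep the two best experts

    # With renormalize=True the kept weights are rescaled to sum to 1 per token,
    # matching the topk_weights / topk_weights.sum(...) line in fused_moe_func.
    renormalized = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
    print(topk_idx)      # [[0 1]]
    print(topk_weights)  # raw softmax mass of the chosen experts
    print(renormalized)  # rescaled weights used to mix expert outputs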