tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +312 -0
- tests/layers/vllm/test_unquantized.py +651 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +21 -3
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +78 -1
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +38 -7
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +17 -0
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +28 -5
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +210 -260
- tpu_inference/layers/vllm/linear_common.py +57 -22
- tpu_inference/layers/vllm/quantization/__init__.py +16 -0
- tpu_inference/layers/vllm/quantization/awq.py +15 -1
- tpu_inference/layers/vllm/quantization/common.py +33 -18
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
- tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
- tpu_inference/layers/vllm/sharding.py +21 -4
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +74 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +89 -26
- tpu_inference/models/jax/utils/weight_utils.py +39 -2
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +47 -64
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +72 -37
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +46 -17
- tpu_inference/runner/lora_utils.py +14 -0
- tpu_inference/runner/multimodal_manager.py +15 -1
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +44 -17
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +42 -36
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +63 -50
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/METADATA +7 -9
- tpu_inference-0.13.2rc3.dist-info/RECORD +261 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,32 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import functools
 
 import jax
 from jax import numpy as jnp
-from jax.
-from jax.
-from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from jax.sharding import Mesh, NamedSharding
+from jax.sharding import PartitionSpec as P
 
+from tpu_inference.kernels.megablox.gmm import gmm
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.vllm.linear_common import \
     slice_sharded_tensor_for_concatenation
-
-P = PartitionSpec
+from tpu_inference.utils import get_mesh_shape_product
 
 
-def activation_fn(activation: str, x1, x2):
+def activation_fn(activation: str, x1: jax.Array, x2: jax.Array) -> jax.Array:
     match activation:
         case "silu":
             return jax.nn.silu(x1) * x2
@@ -23,7 +37,10 @@ def activation_fn(activation: str, x1, x2):
                 f"FusedMoE does not support {activation} activation")
 
 
-def _swigluoai(x1
+def _swigluoai(x1: jax.Array,
+               x2: jax.Array,
+               alpha=1.702,
+               limit=7.0) -> jax.Array:
     x1 = jnp.clip(x1, a_max=limit)
     x2 = jnp.clip(x2, a_min=-limit, a_max=limit)
 
@@ -101,142 +118,124 @@ def _get_tiling_size_for_gmm_kernel(m: int, k: int, n: int,
 def tensor_sharded_gmm_merged_column_parallel(
     lhs: jax.Array,
     rhs: jax.Array,
+    rhs_scale: jax.Array | None,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
-    transpose_rhs: bool,
     mesh: Mesh,
- [… removed lines not shown in this view]
+) -> list[jax.Array]:
+
+    def _gmm(lhs, rhs, rhs_scale, rhs_bias, group_sizes):
+        m, g, n, k = lhs.shape[0], *rhs.shape
+        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+        return gmm(
+            lhs,
+            rhs,
+            group_sizes,
+            rhs_scale=rhs_scale,
+            rhs_bias=rhs_bias,
+            preferred_element_type=lhs.dtype,
+            tiling=(tm, tk, tn),
+            transpose_rhs=True,
+            group_offset=jnp.array(0),
+        )
+
+    rhs_scale_spec = None if rhs_scale is None else P(
+        None, None, None, ShardingAxisName.MLP_TENSOR)
+    rhs_bias_spec = None if rhs_bias is None else P(
+        None, None, ShardingAxisName.MLP_TENSOR)
+
+    gmm_result = jax.shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(P(
- [… removed lines not shown in this view]
-                             0,
-                             total_repeat_length=m // mesh.shape["data"])
-        return (gmm_result_local + rhs_bis).astype(gmm_result_local.dtype)
-
-    gmm_result = shard_map(
-        _add_bias,
-        mesh=mesh,
-        in_specs=(P("data", "model"), P(None, "model"), P("data")),
-        out_specs=(P("data", "model")),
-    )(gmm_result, rhs_bias, group_sizes)
-
-    n_shards = mesh.shape["model"]
+        in_specs=(P(ShardingAxisName.MLP_DATA,
+                    None), P(None, ShardingAxisName.MLP_TENSOR,
+                             None), rhs_scale_spec, rhs_bias_spec,
+                  P(ShardingAxisName.MLP_DATA)),
+        out_specs=(P(ShardingAxisName.MLP_DATA, ShardingAxisName.MLP_TENSOR)),
+        check_vma=False,
+    )(lhs, rhs, rhs_scale, rhs_bias, group_sizes)
+
+    tp_size = get_mesh_shape_product(mesh, ShardingAxisName.MLP_TENSOR)
+    intermediate_size = gmm_result.shape[-1] // 2
     output_sizes = [intermediate_size, intermediate_size]
-
     return slice_sharded_tensor_for_concatenation(gmm_result, output_sizes,
-
+                                                   tp_size)
 
 
 def tensor_sharded_gmm_row_parallel(
     lhs: jax.Array,
     rhs: jax.Array,
+    rhs_scale: jax.Array | None,
     rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
-    transpose_rhs: bool,
     mesh: Mesh,
 ) -> jax.Array:
- [… removed lines not shown in this view]
-        gmm
- [… removed lines not shown in this view]
+
+    def _gmm_all_reduce(lhs, rhs, rhs_scale, rhs_bias, group_sizes):
+        m, g, n, k = lhs.shape[0], *rhs.shape
+        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+        if rhs_bias is not None:
+            shard_id = jax.lax.axis_index(ShardingAxisName.MLP_TENSOR).sum()
+            rhs_bias = jnp.where(shard_id == 0, rhs_bias, 0)
+        out = gmm(
+            lhs,
+            rhs,
+            group_sizes,
+            rhs_scale=rhs_scale,
+            rhs_bias=rhs_bias,
+            preferred_element_type=lhs.dtype,
+            tiling=(tm, tk, tn),
+            transpose_rhs=True,
+            group_offset=jnp.array(0),
+        )
+        return jax.lax.psum(out, axis_name=ShardingAxisName.MLP_TENSOR)
+
+    num_blocks = 1 if rhs_scale is None else rhs_scale.shape[1]
+    rhs_scale_spec = None if num_blocks == 1 else P(
+        None, ShardingAxisName.MLP_TENSOR, None, None)
+    rhs_bias_spec = None if rhs_bias is None else P(None, None, None)
+    gmm_result = jax.shard_map(
         _gmm_all_reduce,
         mesh=mesh,
-        in_specs=(P(
- [… removed lines not shown in this view]
-    def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
-        rhs_bis = jnp.repeat(rhs_bias_local,
-                             group_sizes_global,
-                             0,
-                             total_repeat_length=m // mesh.shape["data"])
-        return (gmm_result_local + rhs_bis).astype(gmm_result_local.dtype)
-
-    gmm_result = shard_map(
-        _add_bias,
-        mesh=mesh,
-        in_specs=(P("data"), P(), P("data")),
-        out_specs=(P("data")),
-    )(gmm_result, rhs_bias, group_sizes)
+        in_specs=(P(ShardingAxisName.MLP_DATA, ShardingAxisName.MLP_TENSOR),
+                  P(None, None, ShardingAxisName.MLP_TENSOR), rhs_scale_spec,
+                  rhs_bias_spec, P(ShardingAxisName.MLP_DATA)),
+        out_specs=(P(ShardingAxisName.MLP_DATA)),
+        check_vma=False,
+    )(lhs, rhs, rhs_scale, rhs_bias, group_sizes)
 
-    return gmm_result
+    return gmm_result.astype(lhs.dtype)
 
 
 def expert_sharded_gmm(
     lhs: jax.Array,
     rhs: jax.Array,
+    rhs_scale: jax.Array | None,
+    rhs_bias: jax.Array | None,
     group_sizes: jax.Array,
-
+    is_last_expert: bool,
     mesh: Mesh,
-    num_experts: int,
-    ep_size: int,
 ) -> jax.Array:
- [… removed lines not shown in this view]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
-
+    ep_size = get_mesh_shape_product(mesh, ShardingAxisName.MLP_TENSOR)
+    ep_p_spec = P(ShardingAxisName.EXPERT)
+    num_experts = rhs.shape[0]
     num_experts_per_shard = num_experts // ep_size
     group_offset = jnp.arange(0, num_experts, num_experts_per_shard)
- [… removed lines not shown in this view]
-    # sharded function, it has only 1 element and `group_offset.shape` is
-    # (1,) but gmm kernel requires the group_offset to be a ()-shaped array,
-    # so we group_offset[0].
-    group_offset_of_shard = group_offset[0]
+
+    def _gmm(lhs, rhs, rhs_scale, rhs_bias, group_sizes, group_offset):
+        m, g, n, k = lhs.shape[0], *rhs.shape
+        tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n, g)
+
         gmm_res = gmm(
             lhs=lhs,
             rhs=rhs,
+            rhs_scale=rhs_scale,
+            rhs_bias=rhs_bias,
             group_sizes=group_sizes,
             preferred_element_type=lhs.dtype,
             tiling=(tm, tk, tn),
-            transpose_rhs=
-            group_offset=
+            transpose_rhs=True,
+            group_offset=group_offset[0],
         )
         return gmm_res
 
@@ -258,35 +257,43 @@ def expert_sharded_gmm(
     # 0, 0, 0, 0  0, 0, 0, 0  0, 0, 0, 0  D, D, D, D
     #  shard-0     shard-1     shard-2     shard-3
     # Each shards has 3 (row A), 2 (row B), 5 (row C) and 4 (row D).
-
+    lhs_spec = ep_p_spec if is_last_expert else P()
+    rhs_spec = ep_p_spec
+    rhs_scale_spec = None if rhs_scale is None else ep_p_spec
+    rhs_bias_spec = None if rhs_bias is None else ep_p_spec
+    gmm_res = jax.shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(
- [… removed lines not shown in this view]
+        in_specs=(
+            lhs_spec,
+            rhs_spec,
+            rhs_scale_spec,
+            rhs_bias_spec,
+            P(),
+            ep_p_spec,
+        ),
+        out_specs=ep_p_spec,
+        check_vma=False,
+    )(lhs, rhs, rhs_scale, rhs_bias, group_sizes, group_offset)
+
+    if not is_last_expert:
+        return gmm_res
 
     # For i-th shard, it is responsible groups (AKA experts) from
    # i*num_experts_per_shard to (i+1)*num_experts_per_shard We sum them up to
     # get total rows in that shard, and that is the size for shard to send to
     # its peers. This is also the number of non-zero rows from the gmm results.
-    # In the working example, send_sizes would be [3, 2, 5, 4]
- [… removed lines not shown in this view]
-    ]
+    # In the working example, send_sizes would be [3, 2, 5, 4].
+
+    # group_sizes has shape of [num_tokens_per_shard * num_experts_per_shard].
+    # So reshaping to [num_tokens_per_shard, num_experts_per_shard] and applying
+    # sum(axis=1) will get desired send_sizes shaped [num_tokens_per_shard].
+    send_sizes = group_sizes.reshape(-1, num_experts_per_shard).sum(axis=1)
     # In the working example, input_offsets would be [0, 3, 5, 10]
     input_offsets = jnp.concatenate((jnp.array([0]), send_sizes.cumsum()[:-1]))
     output_offsets = input_offsets
     recv_sizes = send_sizes
 
-    input_offsets = jax.lax.with_sharding_constraint(
-        input_offsets, NamedSharding(mesh, P("model")))
-    send_sizes = jax.lax.with_sharding_constraint(
-        send_sizes, NamedSharding(mesh, P("model")))
-    output_offsets = jax.lax.with_sharding_constraint(
-        output_offsets, NamedSharding(mesh, P("model")))
-
     def _ragged_all_to_all(operand, input_offsets, send_sizes, output_offsets,
                            recv_sizes):
         output = jnp.zeros_like(operand)
@@ -317,7 +324,7 @@ def expert_sharded_gmm(
             send_sizes_of_shard,
             output_offsets_of_shard,
             recv_sizes_of_shard,
-            axis_name=
+            axis_name=ShardingAxisName.EXPERT)
 
     # Use ragged_all_to_all to send the result from gmm for each expert to all
     # the shards. In the working example, the result would be:
@@ -336,56 +343,74 @@ def expert_sharded_gmm(
     # D, D, D, D  D, D, D, D  D, D, D, D  D, D, D, D
     # D, D, D, D  D, D, D, D  D, D, D, D  D, D, D, D
     #  shard-0     shard-1     shard-2     shard-3
-    return shard_map(
+    return jax.shard_map(
         _ragged_all_to_all,
         mesh=mesh,
-        in_specs=(
-        out_specs=(P()),
-
+        in_specs=(ep_p_spec, ep_p_spec, ep_p_spec, ep_p_spec, P()),
+        out_specs=(P(ShardingAxisName.MLP_DATA)),
+        check_vma=False,
     )(gmm_res, input_offsets, send_sizes, output_offsets, recv_sizes)
 
 
+@functools.partial(
+    jax.jit,
+    static_argnames=(
+        "topk",
+        "renormalize",
+        "mesh",
+        "use_ep",
+        "activation",
+    ),
+)
 def fused_moe_func(
     hidden_states: jax.Array,
     w1: jax.Array,
     w2: jax.Array,
+    w1_scale: jax.Array | None,
+    w2_scale: jax.Array | None,
     w1_bias: jax.Array | None,
     w2_bias: jax.Array | None,
     gating_output: jax.Array,
     topk: int,
-    global_num_experts: int,
     renormalize: bool,
-    reduce_results: bool,
     mesh: Mesh,
     use_ep: bool,
     activation: str,
-):
-    """
+) -> jax.Array:
+    """Route tokens in hidden_states into each experts based on routing.
+
     Args:
-        hidden_states: [
-        w1: [num_experts, intermediate_size * 2, hidden_size]
-        w2: [num_experts, hidden_size, intermediate_size]
-
+        hidden_states: [num_tokens, hidden_size]
+        w1: first moe weights [num_experts, intermediate_size * 2, hidden_size]
+        w2: second moe weights [num_experts, hidden_size, intermediate_size]
+        w1_scale: w1 scale [num_experts, num_blocks, 1, intermediate_size * 2]
+        w2_scale: w2 scale [num_experts, num_blocks, 1, hidden_size]
+        w1_bias: optional bias of w1 [num_experts, 1, intermediate_size * 2]
+        w2_bias: optional bias of w2 [num_experts, 1, hidden_size]
+        gating_output: routing information of tokens [num_tokens, num_experts]
+        topk: number of experts to choose per token.
+        renormalize: normalize gating_output.
+        mesh: mesh to perform moe.
+        use_ep: use expert parallelism.
+        activation: activation function to perform on the output of w1.
+
+    Returns:
+        Output of moe operation [num_tokens, hidden_size]
     """
-
-
-        raise NotImplementedError(
-            "Bias is not supported when using expert parallelism.")
-    orig_shape = hidden_states.shape
-    hidden_size = hidden_states.shape[-1]
-    num_tokens = hidden_states.size // hidden_size
-    assert global_num_experts == w1.shape[0]
-    ep_size = mesh.shape["model"]  # only used if use_ep is True.
-    intermediate_size = w2.shape[-1]
+    num_tokens, hidden_size = hidden_states.shape
+    global_num_experts, _, padded_hidden_size = w1.shape
     dtype = hidden_states.dtype
+
     assert (num_tokens * topk) % 16 == 0, (
         "The kernel requires num_tokens * topk to be a multiple of "
         f"16 but got {num_tokens}*{topk}={num_tokens*topk}")
 
-
-    gating_output = gating_output.reshape(num_tokens, global_num_experts)
+    assert gating_output.shape == (num_tokens, global_num_experts)
 
     topk_weights = jax.nn.softmax(gating_output.astype(jnp.float32), axis=-1)
+    # All-gather topk weights for attention dp
+    topk_weights = jax.lax.with_sharding_constraint(
+        topk_weights, NamedSharding(mesh, P(ShardingAxisName.MLP_DATA, None)))
     topk_weights, topk_indices = jax.lax.top_k(topk_weights, k=topk)
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
@@ -405,152 +430,77 @@ def fused_moe_func(
         x = hidden_states_local[token_indices_sorted]
         return x, group_sizes_local, topk_argsort_revert_indices
 
-    x, group_sizes, topk_argsort_revert_indices = shard_map(
+    x, group_sizes, topk_argsort_revert_indices = jax.shard_map(
        _process_tokens_locally,
         mesh=mesh,
-        in_specs=(P(
-
-
+        in_specs=(P(ShardingAxisName.MLP_DATA,
+                    None), P(ShardingAxisName.MLP_DATA, None)),
+        out_specs=(P(ShardingAxisName.MLP_DATA, None),
+                   P(ShardingAxisName.MLP_DATA), P(ShardingAxisName.MLP_DATA)),
     )(hidden_states, topk_indices)
+
+    x = jnp.pad(x, ((0, 0), (0, padded_hidden_size - hidden_size)))
+
     if use_ep:
         x = expert_sharded_gmm(
             x,
             w1,
-
-            transpose_rhs=True,
-            mesh=mesh,
-            num_experts=global_num_experts,
-            ep_size=ep_size,
-        )
-        x1, x2 = x[..., :intermediate_size], x[..., intermediate_size:]
-    else:
-        x1, x2 = tensor_sharded_gmm_merged_column_parallel(
-            x,
-            w1,
+            w1_scale,
             w1_bias,
             group_sizes,
-
+            is_last_expert=False,
             mesh=mesh,
-            intermediate_size=intermediate_size,
         )
+        x1, x2 = jnp.split(x, 2, -1)
 
-
+        x = activation_fn(activation, x1, x2)
 
-    if use_ep:
         x = expert_sharded_gmm(
             x,
             w2,
+            w2_scale,
+            w2_bias,
            group_sizes,
-
+            is_last_expert=True,
            mesh=mesh,
-            num_experts=global_num_experts,
-            ep_size=ep_size,
         )
     else:
-
-            x,
+        x1, x2 = tensor_sharded_gmm_merged_column_parallel(
+            x,
+            w1,
+            w1_scale,
+            w1_bias,
+            group_sizes,
+            mesh=mesh,
+        )
+
+        x = activation_fn(activation, x1, x2)
+
         x = tensor_sharded_gmm_row_parallel(
             x,
             w2,
+            w2_scale,
            w2_bias,
            group_sizes,
-            transpose_rhs=True,
             mesh=mesh,
         )
 
     def _finalize_output(x_local, topk_argsort_revert_indices_local,
                          topk_weights_local):
         x_local = x_local[topk_argsort_revert_indices_local].reshape(
-            -1, topk,
+            -1, topk, padded_hidden_size)
         x_local = x_local * jnp.expand_dims(topk_weights_local, axis=-1)
         x_local = x_local.sum(axis=-2)
         return x_local
 
-    x = shard_map(
+    x = jax.shard_map(
         _finalize_output,
         mesh=mesh,
-        in_specs=(P(
-
-
+        in_specs=(P(ShardingAxisName.MLP_DATA,
+                    None), P(ShardingAxisName.MLP_DATA),
+                  P(ShardingAxisName.MLP_DATA, None)),
+        out_specs=(P(ShardingAxisName.ATTN_DATA, None)),
+        check_vma=False,
     )(x, topk_argsort_revert_indices, topk_weights)
-    x = x.reshape(orig_shape)
 
-
-    x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P("data")))
-    return x
-
-
-@functools.partial(
-    jax.jit,
-    static_argnames=(
-        "topk",
-        "global_num_experts",
-        "renormalize",
-        "reduce_results",
-        "mesh",
-        "use_ep",
-        "activation",
-    ),
-)
-def fused_moe_func_padded(
-    hidden_states: jax.Array,
-    w1: jax.Array,
-    w2: jax.Array,
-    w1_bias: jax.Array | None,
-    w2_bias: jax.Array | None,
-    gating_output: jax.Array,
-    topk: int,
-    global_num_experts: int,
-    renormalize: bool,
-    reduce_results: bool,
-    mesh: Mesh,
-    use_ep: bool,
-    activation: str,
-):
-    # TODO(fanhongmin@google.com): Once the jax runner pads the input, we no longer need this.
-    hidden_size = hidden_states.shape[-1]
-    num_tokens = hidden_states.size // hidden_size
-    if num_tokens * topk < 16:
-        assert 16 % (num_tokens *
-                     topk) == 0, f"Cannot pad to 16: {num_tokens=}, {topk=}"
-        n_repeats = 16 // (num_tokens * topk)
-
-        reps = (n_repeats, ) + (1, ) * (hidden_states.ndim - 1)
-        expanded_hidden_states = jnp.tile(hidden_states, reps)
-
-        reps = (n_repeats, ) + (1, ) * (gating_output.ndim - 1)
-        expanded_gating_output = jnp.tile(gating_output, reps)
-
-        expanded_x = fused_moe_func(
-            expanded_hidden_states,
-            w1,
-            w2,
-            w1_bias,
-            w2_bias,
-            expanded_gating_output,
-            topk,
-            global_num_experts,
-            renormalize,
-            reduce_results,
-            mesh,
-            use_ep,
-            activation,
-        )
-        x = expanded_x[:hidden_states.shape[0]]
-        return x
-    else:
-        return fused_moe_func(
-            hidden_states,
-            w1,
-            w2,
-            w1_bias,
-            w2_bias,
-            gating_output,
-            topk,
-            global_num_experts,
-            renormalize,
-            reduce_results,
-            mesh,
-            use_ep,
-            activation,
-        )
+    return x[:num_tokens, :hidden_size]