tpu-inference 0.11.1.dev202511270815-py3-none-any.whl → 0.13.0rc2.post7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (251)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_disagg_utils.py +14 -0
  4. tests/core/test_dp_scheduler.py +650 -768
  5. tests/core/test_init.py +14 -0
  6. tests/distributed/__init__.py +13 -0
  7. tests/distributed/test_distributed_utils.py +120 -0
  8. tests/distributed/test_tpu_connector.py +478 -0
  9. tests/e2e/__init__.py +13 -0
  10. tests/e2e/test_async_scheduler.py +211 -0
  11. tests/e2e/test_data_parallel.py +289 -0
  12. tests/e2e/test_hybrid_kvcache.py +219 -0
  13. tests/e2e/test_local_disagg.py +257 -0
  14. tests/e2e/test_model_loader.py +268 -0
  15. tests/e2e/test_multi_modal_inference.py +111 -0
  16. tests/e2e/test_pipeline_parallel.py +265 -0
  17. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  18. tests/e2e/test_sampling_params.py +269 -0
  19. tests/e2e/test_speculative_decoding.py +311 -0
  20. tests/e2e/test_structured_decoding.py +46 -0
  21. tests/executors/__init__.py +13 -0
  22. tests/executors/test_ray_distributed_executor.py +199 -0
  23. tests/experimental/__init__.py +13 -0
  24. tests/experimental/test_llama3_jax_stashed.py +208 -0
  25. tests/kernels/__init__.py +13 -0
  26. tests/kernels/collectives/__init__.py +13 -0
  27. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  28. tests/kernels/fused_moe_v1_test.py +14 -0
  29. tests/kernels/gmm_test.py +205 -0
  30. tests/kernels/mla_v1_test.py +143 -41
  31. tests/kernels/quantized_matmul_kernel_test.py +2 -34
  32. tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
  33. tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
  34. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
  35. tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
  36. tests/layers/__init__.py +13 -0
  37. tests/layers/common/__init__.py +13 -0
  38. tests/layers/common/test_attention_interface.py +156 -0
  39. tests/layers/common/test_quantization.py +149 -0
  40. tests/layers/jax/__init__.py +13 -0
  41. tests/layers/jax/attention/__init__.py +13 -0
  42. tests/layers/jax/attention/test_common_attention.py +103 -0
  43. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  44. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  45. tests/layers/jax/moe/__init__.py +13 -0
  46. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  47. tests/layers/jax/sample/__init__.py +13 -0
  48. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  49. tests/layers/jax/sample/test_sampling.py +115 -0
  50. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  51. tests/layers/jax/test_layers.py +155 -0
  52. tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
  53. tests/layers/jax/test_rope.py +93 -0
  54. tests/layers/jax/test_sharding.py +159 -0
  55. tests/layers/jax/test_transformer_block.py +152 -0
  56. tests/layers/vllm/__init__.py +13 -0
  57. tests/layers/vllm/test_attention.py +363 -0
  58. tests/layers/vllm/test_awq.py +405 -0
  59. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  60. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
  62. tests/layers/vllm/test_fp8.py +17 -0
  63. tests/layers/vllm/test_mxfp4.py +312 -0
  64. tests/layers/vllm/test_unquantized.py +651 -0
  65. tests/layers/vllm/utils.py +87 -0
  66. tests/lora/__init__.py +13 -0
  67. tests/lora/conftest.py +14 -0
  68. tests/lora/test_bgmv.py +14 -0
  69. tests/lora/test_layers.py +21 -3
  70. tests/lora/test_lora.py +15 -1
  71. tests/lora/test_lora_perf.py +67 -0
  72. tests/models/__init__.py +13 -0
  73. tests/models/common/__init__.py +13 -0
  74. tests/models/common/test_model_loader.py +455 -0
  75. tests/models/jax/__init__.py +13 -0
  76. tests/models/jax/test_deepseek_v3.py +401 -0
  77. tests/models/jax/test_llama3.py +184 -0
  78. tests/models/jax/test_llama4.py +298 -0
  79. tests/models/jax/test_llama_eagle3.py +197 -0
  80. tests/models/jax/test_llama_guard_4.py +242 -0
  81. tests/models/jax/test_qwen2.py +172 -0
  82. tests/models/jax/test_qwen2_5_vl.py +605 -0
  83. tests/models/jax/test_qwen3.py +169 -0
  84. tests/models/jax/test_weight_loading.py +180 -0
  85. tests/models/jax/utils/__init__.py +13 -0
  86. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  87. tests/platforms/__init__.py +13 -0
  88. tests/platforms/test_tpu_platform.py +54 -0
  89. tests/runner/__init__.py +13 -0
  90. tests/runner/test_block_table.py +395 -0
  91. tests/runner/test_input_batch.py +226 -0
  92. tests/runner/test_kv_cache.py +220 -0
  93. tests/runner/test_kv_cache_manager.py +498 -0
  94. tests/runner/test_multimodal_manager.py +429 -0
  95. tests/runner/test_persistent_batch_manager.py +84 -0
  96. tests/runner/test_speculative_decoding_manager.py +368 -0
  97. tests/runner/test_structured_decoding_manager.py +220 -0
  98. tests/runner/test_tpu_runner.py +261 -0
  99. tests/runner/test_tpu_runner_dp.py +1099 -0
  100. tests/runner/test_tpu_runner_mesh.py +200 -0
  101. tests/runner/test_utils.py +411 -0
  102. tests/spec_decode/__init__.py +13 -0
  103. tests/spec_decode/test_eagle3.py +311 -0
  104. tests/test_base.py +14 -0
  105. tests/test_envs.py +110 -12
  106. tests/test_tpu_info.py +14 -0
  107. tests/test_utils.py +2 -45
  108. tests/worker/__init__.py +13 -0
  109. tests/worker/tpu_worker_test.py +414 -0
  110. tpu_inference/__init__.py +14 -0
  111. tpu_inference/core/__init__.py +13 -0
  112. tpu_inference/core/sched/__init__.py +13 -0
  113. tpu_inference/core/sched/dp_scheduler.py +372 -56
  114. tpu_inference/distributed/__init__.py +13 -0
  115. tpu_inference/distributed/jax_parallel_state.py +14 -0
  116. tpu_inference/distributed/tpu_connector.py +15 -10
  117. tpu_inference/distributed/utils.py +56 -4
  118. tpu_inference/envs.py +92 -8
  119. tpu_inference/executors/__init__.py +13 -0
  120. tpu_inference/executors/ray_distributed_executor.py +22 -1
  121. tpu_inference/experimental/__init__.py +13 -0
  122. tpu_inference/experimental/llama3_jax_stashed.py +14 -0
  123. tpu_inference/kernels/__init__.py +13 -0
  124. tpu_inference/kernels/collectives/__init__.py +13 -0
  125. tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
  126. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
  127. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  128. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  129. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  130. tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
  131. tpu_inference/kernels/megablox/__init__.py +13 -0
  132. tpu_inference/kernels/megablox/common.py +54 -0
  133. tpu_inference/kernels/megablox/gmm.py +646 -0
  134. tpu_inference/kernels/mla/__init__.py +13 -0
  135. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  136. tpu_inference/kernels/mla/v1/kernel.py +117 -145
  137. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  138. tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
  139. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  140. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  141. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
  142. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
  143. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  144. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
  145. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +167 -97
  146. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
  147. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
  148. tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
  149. tpu_inference/layers/__init__.py +13 -0
  150. tpu_inference/layers/common/__init__.py +13 -0
  151. tpu_inference/layers/common/attention_interface.py +26 -19
  152. tpu_inference/layers/common/attention_metadata.py +14 -0
  153. tpu_inference/layers/common/quant_methods.py +15 -0
  154. tpu_inference/layers/common/quantization.py +270 -0
  155. tpu_inference/layers/common/sharding.py +31 -9
  156. tpu_inference/layers/jax/__init__.py +13 -0
  157. tpu_inference/layers/jax/attention/__init__.py +13 -0
  158. tpu_inference/layers/jax/attention/attention.py +19 -6
  159. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
  160. tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
  161. tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
  162. tpu_inference/layers/jax/base.py +14 -0
  163. tpu_inference/layers/jax/constants.py +13 -0
  164. tpu_inference/layers/jax/layers.py +14 -0
  165. tpu_inference/layers/jax/misc.py +14 -0
  166. tpu_inference/layers/jax/moe/__init__.py +13 -0
  167. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
  168. tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
  169. tpu_inference/layers/jax/moe/moe.py +43 -3
  170. tpu_inference/layers/jax/pp_utils.py +53 -0
  171. tpu_inference/layers/jax/rope.py +14 -0
  172. tpu_inference/layers/jax/rope_interface.py +14 -0
  173. tpu_inference/layers/jax/sample/__init__.py +13 -0
  174. tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
  175. tpu_inference/layers/jax/sample/sampling.py +15 -1
  176. tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
  177. tpu_inference/layers/jax/transformer_block.py +14 -0
  178. tpu_inference/layers/vllm/__init__.py +13 -0
  179. tpu_inference/layers/vllm/attention.py +4 -4
  180. tpu_inference/layers/vllm/fused_moe.py +210 -260
  181. tpu_inference/layers/vllm/linear_common.py +57 -22
  182. tpu_inference/layers/vllm/quantization/__init__.py +16 -0
  183. tpu_inference/layers/vllm/quantization/awq.py +15 -1
  184. tpu_inference/layers/vllm/quantization/common.py +33 -18
  185. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  186. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
  187. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
  188. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  189. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
  190. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
  191. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  192. tpu_inference/layers/vllm/quantization/mxfp4.py +280 -210
  193. tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
  194. tpu_inference/layers/vllm/sharding.py +21 -4
  195. tpu_inference/lora/__init__.py +13 -0
  196. tpu_inference/lora/torch_lora_ops.py +8 -13
  197. tpu_inference/models/__init__.py +13 -0
  198. tpu_inference/models/common/__init__.py +13 -0
  199. tpu_inference/models/common/model_loader.py +77 -36
  200. tpu_inference/models/jax/__init__.py +13 -0
  201. tpu_inference/models/jax/deepseek_v3.py +267 -157
  202. tpu_inference/models/jax/gpt_oss.py +26 -10
  203. tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
  204. tpu_inference/models/jax/llama3.py +99 -36
  205. tpu_inference/models/jax/llama4.py +14 -0
  206. tpu_inference/models/jax/llama_eagle3.py +14 -0
  207. tpu_inference/models/jax/llama_guard_4.py +15 -1
  208. tpu_inference/models/jax/qwen2.py +17 -2
  209. tpu_inference/models/jax/qwen2_5_vl.py +18 -4
  210. tpu_inference/models/jax/qwen3.py +17 -2
  211. tpu_inference/models/jax/utils/__init__.py +13 -0
  212. tpu_inference/models/jax/utils/file_utils.py +14 -0
  213. tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
  214. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  215. tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +91 -31
  216. tpu_inference/models/jax/utils/weight_utils.py +39 -2
  217. tpu_inference/models/vllm/__init__.py +13 -0
  218. tpu_inference/models/vllm/vllm_model_wrapper.py +20 -4
  219. tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
  220. tpu_inference/platforms/__init__.py +14 -0
  221. tpu_inference/platforms/tpu_platform.py +47 -71
  222. tpu_inference/runner/__init__.py +13 -0
  223. tpu_inference/runner/compilation_manager.py +158 -63
  224. tpu_inference/runner/kv_cache.py +54 -20
  225. tpu_inference/runner/kv_cache_manager.py +53 -30
  226. tpu_inference/runner/lora_utils.py +14 -0
  227. tpu_inference/runner/multimodal_manager.py +15 -1
  228. tpu_inference/runner/persistent_batch_manager.py +54 -2
  229. tpu_inference/runner/speculative_decoding_manager.py +14 -0
  230. tpu_inference/runner/structured_decoding_manager.py +14 -0
  231. tpu_inference/runner/tpu_runner.py +105 -57
  232. tpu_inference/runner/utils.py +2 -2
  233. tpu_inference/spec_decode/__init__.py +13 -0
  234. tpu_inference/spec_decode/jax/__init__.py +13 -0
  235. tpu_inference/spec_decode/jax/eagle3.py +65 -19
  236. tpu_inference/tpu_info.py +14 -0
  237. tpu_inference/utils.py +72 -44
  238. tpu_inference/worker/__init__.py +13 -0
  239. tpu_inference/worker/tpu_worker.py +65 -52
  240. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
  241. tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
  242. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  243. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
  244. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
  245. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
  246. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
  247. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
  248. tpu_inference-0.11.1.dev202511270815.dist-info/RECORD +0 -174
  249. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
  250. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
  251. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """TPU-Friendly Fused Mixture of Experts (MoE) kernel."""

 import functools
@@ -19,7 +32,8 @@ def align_to(x, a):


 def get_dtype_packing(dtype):
-    bits = dtypes.bit_width(dtype)
+    bits = (dtypes.bit_width(dtype)
+            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
     return 32 // bits

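
Note on the change above: get_dtype_packing computes how many values of a dtype fit into one 32-bit word, and the new hasattr guard keeps it working across JAX versions that expose either dtypes.bit_width or dtypes.itemsize_bits. A minimal standalone sketch of the same rule, using the byte width instead (valid for byte-aligned dtypes only; sub-byte dtypes need the bit-width APIs the kernel prefers):

    import numpy as np
    import jax.numpy as jnp

    def packing_of(dtype) -> int:
        # 32-bit word divided by the dtype width in bits.
        return 32 // (np.dtype(dtype).itemsize * 8)

    assert packing_of(jnp.float32) == 1   # 1 value per 32-bit word
    assert packing_of(jnp.bfloat16) == 2  # 2 values per word
    assert packing_of(jnp.int8) == 4      # 4 values per word
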
@@ -65,18 +79,19 @@ def ref_moe(
     top_k: int,
     *,
     renormalize_topk_logits: bool = False,
-    activation="silu",
+    act_fn: str = "silu",
     subc_quant_wsz: int | None = None,
     w1_scale:
     (
         jax.Array | None
-    ) = None,  # (num_experts, 2, cdiv(hidden_size, subc_quant_wsz), intermediate_size)
+    ) = None,  # F32(num_experts, 2, hidden_size // subc_quant_wsz, 1, intermediate_size)
     w2_scale:
     (
         jax.Array | None
-    ) = None,  # (num_experts, cdiv(intermediate_size, subc_quant_wsz), hidden_size)
-    b1: jax.Array | None = None,  # (num_experts, 2, intermediate_size)
-    b2: jax.Array | None = None,  # (num_experts, hidden_size)
+    ) = None,  # F32(num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size)
+    b1: jax.Array
+    | None = None,  # F32(num_experts, 2, 1, intermediate_size)
+    b2: jax.Array | None = None,  # F32(num_experts, 1, hidden_size)
 ):
     n_tokens = tokens.shape[0]  # num_tokens

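
The comments above document the new scale and bias layouts: each scale tensor now carries an explicit singleton axis and exactly hidden_size // subc_quant_wsz (or intermediate_size // subc_quant_wsz) sub-channel blocks, replacing the old cdiv-based shapes. A sketch of arrays matching the documented shapes, with illustrative sizes that are not taken from the diff:

    import jax.numpy as jnp

    num_experts, hidden_size, intermediate_size = 8, 1024, 512
    subc_quant_wsz = 256  # sub-channel quantization window

    w1_scale = jnp.ones(
        (num_experts, 2, hidden_size // subc_quant_wsz, 1, intermediate_size),
        jnp.float32)
    w2_scale = jnp.ones(
        (num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size),
        jnp.float32)
    b1 = jnp.zeros((num_experts, 2, 1, intermediate_size), jnp.float32)
    b2 = jnp.zeros((num_experts, 1, hidden_size), jnp.float32)
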
@@ -97,7 +112,7 @@ def ref_moe(

     # Process each token individually
     for i in range(n_tokens):
-        curr_token = jnp.expand_dims(tokens[i], axis=0)  # [1, d_model]
+        curr_token = jnp.expand_dims(tokens[i], axis=0)  # [1, hidden_size]
         assigned_expert_ids = top_k_indices[
             i]  # [top_k] - indices of selected experts for token i
         tok_expert_act = []
@@ -108,19 +123,19 @@ def ref_moe(
             expert_w1 = w1[expert_id, 0].astype(jnp.float32)
             expert_w3 = w1[expert_id, 1].astype(jnp.float32)
             if w1_scale is not None:
-                expert_w1 *= jnp.repeat(w1_scale[expert_id, 0],
+                expert_w1 *= jnp.repeat(w1_scale[expert_id, 0, :, 0],
                                         subc_quant_wsz,
                                         axis=0)[:hidden_size]
-                expert_w3 *= jnp.repeat(w1_scale[expert_id, 1],
+                expert_w3 *= jnp.repeat(w1_scale[expert_id, 1, :, 0],
                                         subc_quant_wsz,
                                         axis=0)[:hidden_size]
             expert_weight_1 = jnp.concat(
                 [expert_w1, expert_w3],
-                axis=-1)  # [d_model, 2 * intermediate_size]
+                axis=-1)  # [hidden_size, 2 * intermediate_size]
             expert_weight_2 = w2[expert_id].astype(
-                jnp.float32)  # [intermediate_size, d_model]
+                jnp.float32)  # [intermediate_size, hidden_size]
             if w2_scale is not None:
-                expert_weight_2 *= jnp.repeat(w2_scale[expert_id],
+                expert_weight_2 *= jnp.repeat(w2_scale[expert_id, :, 0],
                                               subc_quant_wsz,
                                               axis=0)[:intermediate_size]

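
The indexing change above (for example w1_scale[expert_id, 0, :, 0]) just drops the new singleton axis before expansion; the jnp.repeat dequantization pattern itself is unchanged. In isolation:

    import jax.numpy as jnp

    hidden_size, intermediate_size, subc_quant_wsz = 512, 256, 256
    # One scale per subc_quant_wsz-sized block of the contracting dimension.
    scale = jnp.full((hidden_size // subc_quant_wsz, intermediate_size), 0.5)
    # Expand block scales back to one scale per row of the weight matrix.
    full = jnp.repeat(scale, subc_quant_wsz, axis=0)[:hidden_size]
    assert full.shape == (hidden_size, intermediate_size)
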
@@ -132,32 +147,33 @@ def ref_moe(
                 gmm_1_out, 2,
                 axis=-1)  # [1, intermediate_size], [1, intermediate_size]
             if b1 is not None:
-                gmm1_w1_proj += b1[expert_id:expert_id + 1, 0]
-                gmm1_w3_proj += b1[expert_id:expert_id + 1, 1]
+                gmm1_w1_proj += b1[expert_id:expert_id + 1, 0, 0]
+                gmm1_w3_proj += b1[expert_id:expert_id + 1, 1, 0]

             # Apply gated activation: activation(gate) * up
-            act = activation_fn(gmm1_w1_proj, gmm1_w3_proj, activation)
+            act = activation_fn(gmm1_w1_proj, gmm1_w3_proj, act_fn)

             # Second linear layer (down projection)
-            gmm_2_out = act @ expert_weight_2  # [1, d_model]
+            gmm_2_out = act @ expert_weight_2  # [1, hidden_size]
             if b2 is not None:
-                gmm_2_out += b2[expert_id:expert_id + 1]
+                gmm_2_out += b2[expert_id:expert_id + 1, 0]
             tok_expert_act.append(gmm_2_out)

         # Combine outputs from all selected experts
         experts_act = jnp.concatenate(tok_expert_act,
-                                      axis=0)  # [top_k, d_model]
+                                      axis=0)  # [top_k, hidden_size]

         # Weighted sum using top-k gating weights
         top_k_weights = top_k_logits[i]  # [top_k]
         top_k_weights = jnp.expand_dims(top_k_weights, axis=1)  # [top_k, 1]
         weighted_output = jnp.sum(experts_act * top_k_weights,
                                   axis=0,
-                                  keepdims=True)  # [1, d_model]
+                                  keepdims=True)  # [1, hidden_size]

         t_outputs.append(weighted_output.astype(tokens.dtype))

-    return jnp.concatenate(t_outputs, axis=0)  # [num_tokens, d_model]
+    return jnp.concatenate(t_outputs,
+                           axis=0)  # [actual_num_tokens, hidden_size]


 def _fused_ep_moe_kernel(
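
For reference, the per-token combine that the renamed comments describe reduces the stacked expert outputs with the gating weights:

    import jax.numpy as jnp

    top_k, hidden_size = 2, 16
    experts_act = jnp.ones((top_k, hidden_size))  # per-expert outputs
    top_k_weights = jnp.array([0.75, 0.25])       # gating weights
    combined = jnp.sum(experts_act * top_k_weights[:, None],
                       axis=0, keepdims=True)     # [1, hidden_size]
    assert combined.shape == (1, hidden_size)
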
@@ -177,7 +193,7 @@ def _fused_ep_moe_kernel(
     # Output
     output_hbm,  # (local_num_tokens, hidden_size)
     # Scratch
-    t2e_routing_x2_smem,  # <bt_sem_id> (2, bt, padded_num_experts)
+    t2e_routing_x2_smem,  # <bt_sem_id> (2, bt, padded_top_k)
     d2e_count_x2_smem,  # <bt_sem_id> (2, num_devices, 1, padded_num_experts)
     expert_offsets_x2_smem,  # <bt_sem_id> (2, 2, padded_num_experts): for a2a_s and a2a_g
     expert_starts_x2_smem,  # <bt_sem_id> (2, 1, padded_num_experts)
@@ -227,6 +243,11 @@ def _fused_ep_moe_kernel(
     local_num_tokens = tokens_hbm.shape[0]
     local_num_experts, intermediate_size, hidden_size = w2_hbm.shape
     right_id = (my_id + 1) % num_devices
+    num_experts = a2a_g_hbm.shape[0]
+    padded_num_experts = d2e_count_x2_smem.shape[-1]
+    padded_top_k = t2e_routing_x2_smem.shape[-1]
+    assert padded_num_experts == align_to(num_experts, 128)
+    assert padded_top_k == align_to(top_k, 128)

     t_dtype = tokens_hbm.dtype
     t_packing = get_dtype_packing(t_dtype)
@@ -300,35 +321,40 @@ def _fused_ep_moe_kernel(
     def get_top_k(input, top_k, renormalize_topk_logits):
         assert len(input.shape) == 2, input.shape
         input = input.astype(jnp.float32)
+        padded_k_shape = (input.shape[0], padded_top_k)
         top_k_logits_lst = []
         top_k_indices_lst = []
         t2e = jnp.zeros(input.shape, dtype=jnp.int32)
-        t2e_routing = jnp.zeros(input.shape, dtype=jnp.int32)
+        t2e_routing = jnp.zeros(padded_k_shape, dtype=jnp.int32)
         iota = jax.lax.broadcasted_iota(jnp.int32, input.shape, 1)
-        top_k_logits_sum = jnp.zeros((input.shape[0], 128), jnp.float32)
+        padded_k_iota = jax.lax.broadcasted_iota(jnp.int32, padded_k_shape, 1)
+        top_k_logits_sum = jnp.zeros(padded_k_shape, jnp.float32)

         for k_id in range(top_k):
             # TODO(jevinjiang): return both top_k values and indices in Mosaic
             top_k_logits = jnp.broadcast_to(
-                jnp.max(input, axis=1, keepdims=True),
-                (input.shape[0], 128)).astype(input.dtype)
+                jnp.max(input[:, :num_experts], axis=1, keepdims=True),
+                padded_k_shape,
+            ).astype(input.dtype)
+            top_k_logits_lst.append(top_k_logits)
             if renormalize_topk_logits:
                 top_k_logits_sum += top_k_logits
-            top_k_logits_lst.append(top_k_logits)
             # TODO(jevinjiang): support bf16 argmax in Mosaic
             top_k_indices = jnp.broadcast_to(
-                jnp.argmax(input, axis=1, keepdims=True), input.shape)
+                jnp.argmax(input[:, :num_experts], axis=1, keepdims=True),
+                padded_k_shape,
+            )
             top_k_indices_lst.append(top_k_indices)
-            t2e_routing = jnp.where(iota == k_id, top_k_indices, t2e_routing)
-            mask = iota == top_k_indices
+            t2e_routing = jnp.where(padded_k_iota == k_id, top_k_indices,
+                                    t2e_routing)
+            mask = iota == broadcast_minor(top_k_indices, input.shape)
             t2e += mask.astype(jnp.int32)
             if k_id != top_k - 1:
                 input = jnp.where(mask, -jnp.inf, input)

         if renormalize_topk_logits:
             for k_id in range(top_k):
-                top_k_logits_lst[
-                    k_id] = top_k_logits_lst[k_id] / top_k_logits_sum
+                top_k_logits_lst[k_id] /= top_k_logits_sum

         expert_sizes = jnp.sum(t2e, axis=0, keepdims=True)
         expert_starts = jnp.zeros_like(expert_sizes)
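
The rewritten get_top_k keeps the kernel's iterative max/argmax/mask scheme but pads the minor (lane) dimension to padded_top_k = align_to(top_k, 128) instead of a hard-coded 128, and keeps padded logits out of the reductions via input[:, :num_experts]. A plain-JAX sketch of the unpadded algorithm:

    import jax
    import jax.numpy as jnp

    def iterative_top_k(logits, top_k):
        vals, idxs = [], []
        for _ in range(top_k):
            idxs.append(jnp.argmax(logits, axis=1))
            vals.append(jnp.max(logits, axis=1))
            # Mask the winner out so the next round finds the runner-up.
            mask = jax.nn.one_hot(idxs[-1], logits.shape[1], dtype=jnp.bool_)
            logits = jnp.where(mask, -jnp.inf, logits)
        return jnp.stack(vals, axis=1), jnp.stack(idxs, axis=1)

    v, i = iterative_top_k(jnp.array([[0.1, 0.9, 0.3, 0.5]]), top_k=2)
    # v == [[0.9, 0.5]], i == [[1, 3]]
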
@@ -1071,27 +1097,38 @@ def _fused_ep_moe_kernel(

     all_reduce_metadata(bt_sem_id, t2e_routing, expert_starts,
                         expert_sizes)
+    sync_barrier()

+    # Start a2a scatter for first active expert.
     start_a2a_scatter(bt_id=bt_id, e_sem_id=e_sem_id, local_e_id=0)

     def run_per_expert(local_e_id, e_sem_id):
         sync_barrier()
+
+        # Prefetch weights for CURRENT active expert.
+        # TODO(jevinjiang): It is hard to prefetch weights in previous iteration
+        # because the expert_ffn keeps overwriting the buffers. Triple buffering
+        # could resolve this but it takes more VMEM scratch. Need further
+        # experiment on this.
+        start_fetch_bw1(local_e_id, bw1_sem_id=0, bf_id=0, bd1_id=0)
+        start_fetch_bw3(local_e_id, bw3_sem_id=0, bf_id=0, bd3_id=0)
+
+        # Next ids.
         next_e_sem_id = lax.select(e_sem_id == 0, 1, 0)
         next_local_e_id = local_e_id + 1

+        # Start a2a scatter for NEXT active expert.
         @pl.when(next_local_e_id < local_num_experts)
         def _():
             start_a2a_scatter(bt_id, next_e_sem_id, next_local_e_id)

-        # Prefetch weights for active expert.
-        start_fetch_bw1(local_e_id, bw1_sem_id=0, bf_id=0, bd1_id=0)
-        start_fetch_bw3(local_e_id, bw3_sem_id=0, bf_id=0, bd3_id=0)
-
-        # Wait for a2a scatter and perform FFN for active expert.
+        # Wait a2a scatter for CURRENT active expert.
         wait_a2a_scatter_recv(bt_id, e_sem_id, local_e_id)
+
+        # Perform FFN for CURRENT active expert.
         expert_ffn(bt_id, e_sem_id, local_e_id)

-        # Wait for a2a gather to send back tokens for active expert.
+        # Start a2a gather to send back tokens for CURRENT active expert.
         start_a2a_gather(bt_id, e_sem_id, local_e_id)

         # A must-wait before next sync_barrier.
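
The reordering above issues the weight prefetch before the next-expert a2a scatter, so both DMAs are in flight while the current expert's FFN runs; the scratch buffers stay double-buffered, indexed by a slot id that flips each iteration. The flip, written after the kernel's lax.select pattern:

    import jax.numpy as jnp
    from jax import lax

    def next_slot(e_sem_id):
        # Equivalent to (e_sem_id + 1) % 2.
        return lax.select(e_sem_id == 0, jnp.int32(1), jnp.int32(0))

    assert int(next_slot(jnp.int32(0))) == 1
    assert int(next_slot(jnp.int32(1))) == 0
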
@@ -1104,7 +1141,10 @@ def _fused_ep_moe_kernel(
                          e_sem_id,
                          unroll=False)

+    # Wait to receive a2a gather for ALL experts.
     wait_a2a_gather_recv_all()
+
+    # Accumulate results for current batch.
     output = bt_acc(bt_id, top_k_logits_lst)

     # Make sure it is safe to overwrite output buffer.
@@ -1158,18 +1198,18 @@ def fused_ep_moe(
     w2: jax.Array,  # (num_experts, intermediate_size, hidden_size)
     gating_output: jax.Array,  # (num_tokens, num_experts)
     top_k: int,
+    *,
     renormalize_topk_logits: bool = False,
     act_fn: str = "silu",
-    *,
     subc_quant_wsz: int | None = None,
     w1_scale: (
         jax.Array | None
-    ) = None,  # (num_experts, 2, cdiv(hidden_size, subc_quant_wsz), intermediate_size)
+    ) = None,  # F32(num_experts, 2, hidden_size // subc_quant_wsz, 1, intermediate_size)
     w2_scale: (
         jax.Array | None
-    ) = None,  # (num_experts, cdiv(intermediate_size, subc_quant_wsz), hidden_size)
-    b1: jax.Array | None = None,  # (num_experts, 2, intermediate_size)
-    b2: jax.Array | None = None,  # (num_experts, hidden_size)
+    ) = None,  # F32(num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size)
+    b1: jax.Array | None = None,  # F32(num_experts, 2, 1, intermediate_size)
+    b2: jax.Array | None = None,  # F32(num_experts, 1, hidden_size)
     # Kernel tuning parameters.
     bt: int,
     bf: int,
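
Moving the bare * up makes renormalize_topk_logits and act_fn keyword-only, so existing positional call sites fail loudly instead of silently binding to the wrong parameter. A generic illustration of the boundary (toy function, not the real signature):

    def f(x, top_k, *, renormalize_topk_logits=False, act_fn="silu"):
        return act_fn

    f(0, 2, act_fn="gelu")     # OK
    try:
        f(0, 2, True, "gelu")  # positional use is now a TypeError
    except TypeError:
        pass
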
@@ -1182,75 +1222,159 @@ def fused_ep_moe(
     ep_axis_name: str = "model",
 ):
     # TODO(jevinjiang): move all these assertions to validation function.
-    # Assert all other axes have length of 1
-    assert len(mesh.shape) == 2, "Expect 2D mesh"
-    assert ("data" in mesh.shape
-            and mesh.shape["data"] == 1), "Expect data axis size of 1"
+    if len(mesh.shape) != 2:
+        raise NotImplementedError("Only 2D mesh is supported.")
+
+    for axis_name in mesh.axis_names:
+        if axis_name == ep_axis_name:
+            continue
+        if mesh.shape[axis_name] != 1:
+            raise NotImplementedError(
+                f"Expected all non-ep axis to have size 1 in {mesh.shape=}")

     ep_size = mesh.shape[ep_axis_name]
     num_devices = ep_size

-    num_tokens, actual_hidden_size = tokens.shape
-    num_experts, actual_intermediate_size, _ = w2.shape
+    num_tokens, hidden_size = tokens.shape
+    num_experts, intermediate_size, _ = w2.shape
+
+    if w1.shape != (num_experts, 2, hidden_size, intermediate_size):
+        raise ValueError(
+            f"Expected {w1.shape=} to be"
+            f" {(num_experts, 2, hidden_size, intermediate_size)}.")
+
+    if w2.shape != (num_experts, intermediate_size, hidden_size):
+        raise ValueError(f"Expected {w2.shape=} to be"
+                         f" {(num_experts, intermediate_size, hidden_size)}.")

-    assert num_tokens % ep_size == 0
-    assert num_experts % ep_size == 0
+    if gating_output.shape != (num_tokens, num_experts):
+        raise ValueError(
+            f"Expected {gating_output.shape=} to be {(num_tokens, num_experts)}."
+        )
+
+    if not (0 < top_k <= num_experts):
+        raise ValueError(
+            f"Expected {top_k=} to be in range (0, {num_experts=}].")
+
+    if hidden_size % 128 != 0 or intermediate_size % 128 != 0:
+        raise ValueError(
+            f"Expected {hidden_size=} and {intermediate_size=} to be aligned to"
+            " 128. Did you pad them with zeros outside the kernel?")
+    if num_tokens % ep_size != 0:
+        raise ValueError(
+            f"Expected {num_tokens=} to be aligned to {ep_size=}.")
+    if num_experts % ep_size != 0:
+        raise ValueError(
+            f"Expected {num_experts=} to be aligned to {ep_size=}.")

     local_num_tokens = num_tokens // ep_size
     # local_num_experts = num_experts // ep_size
     padded_num_experts = align_to(num_experts, 128)
+    padded_top_k = align_to(top_k, 128)
     t_dtype = tokens.dtype
     t_packing = get_dtype_packing(t_dtype)

+    # Override bt
+    if local_num_tokens <= t_packing * 8:
+        bt = local_num_tokens
+        btc = bt
+    bt = min(local_num_tokens, bt)
+    # The worst case is that all devices send bt to one device.
+    btc = min(bt, btc, bt * num_devices)
+
+    if local_num_tokens % t_packing != 0:
+        raise ValueError(
+            f"Expected {local_num_tokens=} to be aligned to {t_packing=}.")
+
+    if bt % t_packing != 0:
+        raise ValueError(f"Expected {bt=} to be aligned to {t_packing=}.")
+    if local_num_tokens % bt != 0:
+        raise ValueError(
+            f"Expected {local_num_tokens=} to be aligned to {bt=}.")
+
     if subc_quant_wsz is not None:
+        if subc_quant_wsz <= 0:
+            raise ValueError(f"Expected {subc_quant_wsz=} to be positive.")
         if subc_quant_wsz % 256 != 0:
-            raise NotImplementedError(
-                "Sub-quantized window is not aligned to 256.")
-        # We force compute size of contracting dim to subc_quant_wsz. So we can
+            raise ValueError(
+                f"Expected {subc_quant_wsz=} to be aligned to 256.")
+        if hidden_size % subc_quant_wsz != 0:
+            raise ValueError(
+                f"Expected {hidden_size=} to be aligned to {subc_quant_wsz=}.")
+        if intermediate_size % subc_quant_wsz != 0:
+            raise ValueError(
+                f"Expected {intermediate_size=} to be aligned to {subc_quant_wsz=}."
+            )
+        # We force compute size of contracting dim to be subc_quant_wsz. So we can
         # apply same scale after matmul and accumulation.
         bd1c = subc_quant_wsz * t_packing
         bfc = subc_quant_wsz

-    assert bfc % 128 == 0
-    assert bd1c % (t_packing * 128) == 0
-    assert bd2c % (t_packing * 128) == 0
-    assert bf % bfc == 0
-    assert bd1 % bd1c == 0
-    assert bd2 % bd2c == 0
-
-    btc = min(btc, bt * num_devices)
-    hidden_size = align_to(actual_hidden_size, 128 * t_packing)
-    # TODO(jevinjiang): instead of padding outside the kernel, we can try dynammic
-    # masking inside the kernel.
-    hidden_size = align_to(hidden_size, bd1)
-    hidden_size = align_to(hidden_size, bd2)
-    intermediate_size = align_to(actual_intermediate_size, bf)
-
-    # TODO(jevinjiang): we should dump scale as the kernel expected shape in the
+    if bfc % 128 != 0:
+        raise ValueError(f"Expected {bfc=} to be aligned to 128.")
+    if bd1c % (t_packing * 128) != 0:
+        raise ValueError(
+            f"Expected {bd1c=} to be aligned to {t_packing * 128}.")
+    if bd2c % (t_packing * 128) != 0:
+        raise ValueError(
+            f"Expected {bd2c=} to be aligned to {t_packing * 128}.")
+    if bf % bfc != 0:
+        raise ValueError(f"Expected {bf=} to be aligned to {bfc=}.")
+    if bd1 % bd1c != 0:
+        raise ValueError(f"Expected {bd1=} to be aligned to {bd1c=}.")
+    if bd2 % bd2c != 0:
+        raise ValueError(f"Expected {bd2=} to be aligned to {bd2c=}.")
+    if hidden_size % bd1 != 0 or hidden_size % bd2 != 0:
+        raise ValueError(
+            f"Expected {hidden_size=} to be aligned to {bd1=} and {bd2=}.")
+    if intermediate_size % bf != 0:
+        raise ValueError(
+            f"Expected {intermediate_size=} to be aligned to {bf=}.")
+
+    # Note: we should dump scale as the kernel expected shape in the
     # checkpoint offline or reshape right after weight loading.
     if w1_scale is not None:
-        assert w1_scale.shape[0] == w1.shape[0]
-        assert w1_scale.shape[1] == w1.shape[1] == 2
-        assert w1_scale.shape[2] == cdiv(w1.shape[2], subc_quant_wsz)
-        assert w1_scale.shape[3] == w1.shape[3]
-        w1_scale = jnp.expand_dims(w1_scale.astype(jnp.float32), axis=-2)
+        expected_w1_scale_shape = (
+            num_experts,
+            2,
+            hidden_size // subc_quant_wsz,
+            1,
+            intermediate_size,
+        )
+        if w1_scale.shape != expected_w1_scale_shape:
+            raise ValueError(
+                f"Expected {w1_scale.shape=} to be {expected_w1_scale_shape}.")
+        if w1_scale.dtype != jnp.float32:
+            w1_scale = w1_scale.astype(jnp.float32)

     if w2_scale is not None:
-        assert w2_scale.shape[0] == w2.shape[0]
-        assert w2_scale.shape[1] == cdiv(w2.shape[1], subc_quant_wsz)
-        assert w2_scale.shape[2] == w2.shape[2]
-        w2_scale = jnp.expand_dims(w2_scale.astype(jnp.float32), axis=-2)
+        expected_w2_scale_shape = (
+            num_experts,
+            intermediate_size // subc_quant_wsz,
+            1,
+            hidden_size,
+        )
+        if w2_scale.shape != expected_w2_scale_shape:
+            raise ValueError(
+                f"Expected {w2_scale.shape=} to be {expected_w2_scale_shape}.")
+        if w2_scale.dtype != jnp.float32:
+            w2_scale = w2_scale.astype(jnp.float32)

     if b1 is not None:
-        assert b1.shape[0] == w1.shape[0]
-        assert b1.shape[1] == w1.shape[1] == 2
-        assert b1.shape[2] == w1.shape[3]
-        b1 = jnp.expand_dims(b1.astype(jnp.float32), axis=-2)
+        expected_b1_shape = (num_experts, 2, 1, intermediate_size)
+        if b1.shape != expected_b1_shape:
+            raise ValueError(
+                f"Expected {b1.shape=} to be {expected_b1_shape}.")
+        if b1.dtype != jnp.float32:
+            b1 = b1.astype(jnp.float32)

     if b2 is not None:
-        assert b2.shape[0] == w2.shape[0]
-        assert b2.shape[1] == w2.shape[2]
-        b2 = jnp.expand_dims(b2.astype(jnp.float32), axis=-2)
+        expected_b2_shape = (num_experts, 1, hidden_size)
+        if b2.shape != expected_b2_shape:
+            raise ValueError(
+                f"Expected {b2.shape=} to be {expected_b2_shape}.")
+        if b2.dtype != jnp.float32:
+            b2 = b2.astype(jnp.float32)

     # Prepare inputs for the kernel.
     if padded_num_experts != gating_output.shape[-1]:
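
All the raise-based checks above reduce to divisibility against the 128-lane TPU tiling. Assuming align_to has the usual round-up definition (it is defined near the top of this file; cdiv is plain ceiling division), the padded sizes behave like:

    def cdiv(a: int, b: int) -> int:
        return (a + b - 1) // b

    def align_to(x: int, a: int) -> int:
        return cdiv(x, a) * a

    assert align_to(300, 128) == 384  # padded_num_experts for 300 experts
    assert align_to(2, 128) == 128    # padded_top_k for top_k = 2
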
@@ -1260,248 +1384,171 @@ def fused_ep_moe(
             constant_values=-jnp.inf,
         )

-    if (hidden_size != actual_hidden_size
-            or intermediate_size != actual_intermediate_size):
-        tokens = jnp.pad(
-            tokens,
-            ((0, 0), (0, hidden_size - actual_hidden_size)),
-            constant_values=0,
-        )
-        w1 = jnp.pad(
-            w1,
-            (
-                (0, 0),
-                (0, 0),
-                (0, hidden_size - actual_hidden_size),
-                (0, intermediate_size - actual_intermediate_size),
-            ),
-            constant_values=0,
-        )
-        w2 = jnp.pad(
-            w2,
-            (
-                (0, 0),
-                (0, intermediate_size - actual_intermediate_size),
-                (0, hidden_size - actual_hidden_size),
-            ),
-            constant_values=0,
-        )
-        if w1_scale is not None:
-            w1_scale = jnp.pad(
-                w1_scale,
-                (
-                    (0, 0),
-                    (0, 0),
-                    (0,
-                     cdiv(hidden_size, subc_quant_wsz) - w1_scale.shape[-3]),
-                    (0, 0),
-                    (0, intermediate_size - w1_scale.shape[-1]),
-                ),
-                constant_values=0,
-            )
-        if w2_scale is not None:
-            w2_scale = jnp.pad(
-                w2_scale,
-                (
-                    (0, 0),
-                    (0, cdiv(intermediate_size, subc_quant_wsz) -
-                     w2_scale.shape[-3]),
-                    (0, 0),
-                    (0, hidden_size - w2_scale.shape[-1]),
-                ),
-                constant_values=0,
-            )
-        if b1 is not None:
-            b1 = jnp.pad(
-                b1,
-                (
-                    (0, 0),
-                    (0, 0),
-                    (0, 0),
-                    (0, intermediate_size - b1.shape[-1]),
-                ),
-                constant_values=0,
-            )
-        if b2 is not None:
-            b2 = jnp.pad(
-                b2,
-                (
-                    (0, 0),
-                    (0, 0),
-                    (0, hidden_size - b2.shape[-1]),
-                ),
-                constant_values=0,
-            )
-
     tokens = tokens.reshape(-1, t_packing, hidden_size // t_packing)

     hbm_block_spec = pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM)
-    scope_name = f"fused_moe_k-{top_k}_renorm-{renormalize_topk_logits}_bt-{bt}-{btc}_bf-{bf}-{bfc}_bd1-{bd1}-{bd1c}_bd2-{bd2}-{bd2c}"
-    fused_moe = jax.named_scope(scope_name)(
-        pl.pallas_call(
-            functools.partial(
-                _fused_ep_moe_kernel,
-                top_k=top_k,
-                renormalize_topk_logits=renormalize_topk_logits,
-                ep_axis_name=ep_axis_name,
-                act_fn=act_fn,
-                subc_quant_wsz=subc_quant_wsz,
-                bt=bt,
-                bf=bf,
-                bd1=bd1,
-                bd2=bd2,
-                btc=btc,
-                bfc=bfc,
-                bd1c=bd1c,
-                bd2c=bd2c,
-            ),
-            out_shape=jax.ShapeDtypeStruct((local_num_tokens, hidden_size),
-                                           t_dtype),
-            grid_spec=pltpu.PrefetchScalarGridSpec(
-                num_scalar_prefetch=0,
-                in_specs=[
-                    hbm_block_spec,  # tokens_hbm
-                    hbm_block_spec,  # w1_hbm
-                    hbm_block_spec,  # w2_hbm
-                    None
-                    if w1_scale is None else hbm_block_spec,  # w1_scale_hbm
-                    None
-                    if w2_scale is None else hbm_block_spec,  # w2_scale_hbm
-                    None if b1 is None else hbm_block_spec,  # b1_hbm
-                    None if b2 is None else hbm_block_spec,  # b2_hbm
-                    hbm_block_spec,  # gating_output_hbm
-                    hbm_block_spec,  # a2a_g_hbm
-                ],
-                out_specs=pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
-                scratch_shapes=([
-                    # t2e_routing_x2_smem
-                    pltpu.SMEM((2, bt, padded_num_experts), jnp.int32),
-                    # d2e_count_x2_smem
-                    pltpu.SMEM((2, num_devices, 1, padded_num_experts),
-                               jnp.int32),
-                    # expert_offsets_x2_smem
-                    pltpu.SMEM((2, 2, padded_num_experts), jnp.int32),
-                    # expert_starts_x2_smem
-                    pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
-                    # expert_sizes_x2_smem
-                    pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
-                    # a2a_s_sends_x2_smem
-                    pltpu.SMEM((2, ), jnp.int32),
-                    # a2a_s_x2_vmem
-                    pltpu.VMEM(
-                        (
-                            2,
-                            bt * num_devices,
-                            t_packing,
-                            hidden_size // t_packing,
-                        ),
-                        t_dtype,
+    renorm_str = "-renorm_k" if renormalize_topk_logits else ""
+    scope_name = f"fused-moe-k_{top_k}{renorm_str}-bt_{bt}_{btc}-bf_{bf}_{bfc}-bd1_{bd1}_{bd1c}-bd2_{bd2}_{bd2c}"
+    fused_moe = pl.pallas_call(
+        functools.partial(
+            _fused_ep_moe_kernel,
+            top_k=top_k,
+            renormalize_topk_logits=renormalize_topk_logits,
+            ep_axis_name=ep_axis_name,
+            act_fn=act_fn,
+            subc_quant_wsz=subc_quant_wsz,
+            bt=bt,
+            bf=bf,
+            bd1=bd1,
+            bd2=bd2,
+            btc=btc,
+            bfc=bfc,
+            bd1c=bd1c,
+            bd2c=bd2c,
+        ),
+        out_shape=jax.ShapeDtypeStruct((local_num_tokens, hidden_size),
+                                       t_dtype),
+        grid_spec=pltpu.PrefetchScalarGridSpec(
+            num_scalar_prefetch=0,
+            in_specs=[
+                hbm_block_spec,  # tokens_hbm
+                hbm_block_spec,  # w1_hbm
+                hbm_block_spec,  # w2_hbm
+                None if w1_scale is None else hbm_block_spec,  # w1_scale_hbm
+                None if w2_scale is None else hbm_block_spec,  # w2_scale_hbm
+                None if b1 is None else hbm_block_spec,  # b1_hbm
+                None if b2 is None else hbm_block_spec,  # b2_hbm
+                hbm_block_spec,  # gating_output_hbm
+                hbm_block_spec,  # a2a_g_hbm
+            ],
+            out_specs=pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+            scratch_shapes=([
+                # t2e_routing_x2_smem
+                pltpu.SMEM((2, bt, padded_top_k), jnp.int32),
+                # d2e_count_x2_smem
+                pltpu.SMEM((2, num_devices, 1, padded_num_experts), jnp.int32),
+                # expert_offsets_x2_smem
+                pltpu.SMEM((2, 2, padded_num_experts), jnp.int32),
+                # expert_starts_x2_smem
+                pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
+                # expert_sizes_x2_smem
+                pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
+                # a2a_s_sends_x2_smem
+                pltpu.SMEM((2, ), jnp.int32),
+                # a2a_s_x2_vmem
+                pltpu.VMEM(
+                    (
+                        2,
+                        bt * num_devices,
+                        t_packing,
+                        hidden_size // t_packing,
                     ),
-                    # a2a_s_acc_x2_vmem
-                    pltpu.VMEM(
-                        (
-                            2,
-                            bt * num_devices,
-                            t_packing,
-                            hidden_size // t_packing,
-                        ),
-                        t_dtype,
+                    t_dtype,
+                ),
+                # a2a_s_acc_x2_vmem
+                pltpu.VMEM(
+                    (
+                        2,
+                        bt * num_devices,
+                        t_packing,
+                        hidden_size // t_packing,
                     ),
-                    # a2a_g_acc_vmem
-                    pltpu.VMEM(
-                        (top_k, bt, t_packing, hidden_size // t_packing),
-                        t_dtype),
-                    # b_gating_x2_vmem
-                    pltpu.VMEM((2, bt, padded_num_experts), t_dtype),
-                    # b_output_x2_vmem
-                    pltpu.VMEM((2, bt, hidden_size), t_dtype),
-                    # b_w1_x2_vmem
-                    pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
-                    # b_w3_x2_vmem
-                    pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
-                    # b_w2_x2_vmem
-                    pltpu.VMEM((2, t_packing, bf, bd2 // t_packing), w2.dtype),
-                    # b_w1_scale_x2_vmem
-                    (None if w1_scale is None else pltpu.VMEM(
-                        (
-                            2,
-                            t_packing,
-                            bd1 // t_packing // subc_quant_wsz,
-                            1,
-                            bf,
-                        ),
-                        jnp.float32,
-                    )),
-                    # b_w3_scale_x2_vmem
-                    (None if w1_scale is None else pltpu.VMEM(
-                        (
-                            2,
-                            t_packing,
-                            bd1 // t_packing // subc_quant_wsz,
-                            1,
-                            bf,
-                        ),
-                        jnp.float32,
-                    )),
-                    # b_w2_scale_x2_vmem
-                    (None if w2_scale is None else pltpu.VMEM(
-                        (
-                            2,
-                            t_packing,
-                            bf // subc_quant_wsz,
-                            1,
-                            bd2 // t_packing,
-                        ),
-                        jnp.float32,
-                    )),
-                    # b_b1_x2_vmem
-                    (None if b1 is None else pltpu.VMEM(
-                        (
-                            2,
-                            1,
-                            bf,
-                        ),
-                        jnp.float32,
-                    )),
-                    # b_b3_x2_vmem
-                    (None if b1 is None else pltpu.VMEM(
-                        (
-                            2,
-                            1,
-                            bf,
-                        ),
-                        jnp.float32,
-                    )),
-                    # b_b2_x2_vmem
-                    (None if b2 is None else pltpu.VMEM(
-                        (
-                            2,
-                            t_packing,
-                            1,
-                            bd2 // t_packing,
-                        ),
-                        jnp.float32,
-                    )),
-                    # b_acc_vmem
-                    pltpu.VMEM((bt * num_devices, 1, bf * 2), jnp.float32),
-                    # local_sems
-                    pltpu.SemaphoreType.DMA((2, 5)),
-                    # send_sems
-                    pltpu.SemaphoreType.DMA((2, )),
-                    # recv_sems
-                    pltpu.SemaphoreType.DMA((2, )),
-                    # a2a_gather_sem
-                    pltpu.SemaphoreType.DMA,
-                    # a2a_acc_sem
-                    pltpu.SemaphoreType.DMA,
-                ]),
-            ),
-            compiler_params=pltpu.CompilerParams(
-                collective_id=0,
-                vmem_limit_bytes=100 * 1024 * 1024,
-            ),
-            name=scope_name,
-        ))
+                    t_dtype,
+                ),
+                # a2a_g_acc_vmem
+                pltpu.VMEM((top_k, bt, t_packing, hidden_size // t_packing),
+                           t_dtype),
+                # b_gating_x2_vmem
+                pltpu.VMEM((2, bt, padded_num_experts), t_dtype),
+                # b_output_x2_vmem
+                pltpu.VMEM((2, bt, hidden_size), t_dtype),
+                # b_w1_x2_vmem
+                pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
+                # b_w3_x2_vmem
+                pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
+                # b_w2_x2_vmem
+                pltpu.VMEM((2, t_packing, bf, bd2 // t_packing), w2.dtype),
+                # b_w1_scale_x2_vmem
+                (None if w1_scale is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        bd1 // t_packing // subc_quant_wsz,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_w3_scale_x2_vmem
+                (None if w1_scale is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        bd1 // t_packing // subc_quant_wsz,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_w2_scale_x2_vmem
+                (None if w2_scale is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        bf // subc_quant_wsz,
+                        1,
+                        bd2 // t_packing,
+                    ),
+                    jnp.float32,
+                )),
+                # b_b1_x2_vmem
+                (None if b1 is None else pltpu.VMEM(
+                    (
+                        2,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_b3_x2_vmem
+                (None if b1 is None else pltpu.VMEM(
+                    (
+                        2,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_b2_x2_vmem
+                (None if b2 is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        1,
+                        bd2 // t_packing,
+                    ),
+                    jnp.float32,
+                )),
+                # b_acc_vmem
+                pltpu.VMEM((bt * num_devices, 1, bf * 2), jnp.float32),
+                # local_sems
+                pltpu.SemaphoreType.DMA((2, 5)),
+                # send_sems
+                pltpu.SemaphoreType.DMA((2, )),
+                # recv_sems
+                pltpu.SemaphoreType.DMA((2, )),
+                # a2a_gather_sem
+                pltpu.SemaphoreType.DMA,
+                # a2a_acc_sem
+                pltpu.SemaphoreType.DMA,
+            ]),
+        ),
+        compiler_params=pltpu.CompilerParams(
+            collective_id=0,
+            vmem_limit_bytes=100 * 1024 * 1024,
+        ),
+        name=scope_name,
+    )

     @jax.jit
     @jax.shard_map(
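
A back-of-the-envelope check on the scratch sizes above, for illustrative values that are not taken from the diff (bt=128, 8 devices, hidden_size=7168, bf16 tokens): the two largest double-buffered buffers, a2a_s_x2_vmem and a2a_s_acc_x2_vmem, each hold 2 * bt * num_devices * hidden_size elements (t_packing folds into the last two axes), and together they must fit under the 100 MiB vmem_limit_bytes:

    bt, num_devices, hidden_size = 128, 8, 7168  # assumed, not from the diff
    bytes_per_elem = 2  # bf16
    one_buffer = 2 * bt * num_devices * hidden_size * bytes_per_elem
    print(one_buffer / 2**20)  # ~28 MiB per buffer, ~56 MiB for the pair
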
@@ -1552,7 +1599,7 @@ def fused_ep_moe(

         a2a_g_hbm_scratch = pl.empty(
             (num_experts, bt, t_packing, hidden_size // t_packing), t_dtype)
-        results = kernel(
+        return kernel(
             tokens,
             w1,
             w2,
@@ -1563,4 +1610,3 @@ def fused_ep_moe(
             gating_output,
             a2a_g_hbm_scratch,
         )
-        return results[:, :actual_hidden_size]