tpu-inference 0.12.0.dev20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
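
The hunk below appears to correspond to entry 138 above, tpu_inference/kernels/fused_moe/v1/kernel.py (+1612 lines): a Pallas TPU kernel for expert-parallel fused MoE, together with a pure-JAX reference implementation (ref_moe) and a jitted entry point (fused_ep_moe) that expects a 2D device mesh and explicit block-size tuning parameters. As a quick orientation, here is a minimal sketch (not part of the package; shapes, dtypes, and values are illustrative assumptions) that exercises only the reference implementation defined in the diff:

    import jax
    import jax.numpy as jnp
    from tpu_inference.kernels.fused_moe.v1.kernel import ref_moe

    # Illustrative (assumed) sizes; the package does not prescribe these.
    num_tokens, hidden_size, intermediate_size = 8, 256, 512
    num_experts, top_k = 4, 2

    key = jax.random.PRNGKey(0)
    k1, k2, k3, k4 = jax.random.split(key, 4)
    tokens = jax.random.normal(k1, (num_tokens, hidden_size), dtype=jnp.bfloat16)
    # w1 stacks the gate and up projections along axis 1; w2 is the down projection.
    w1 = jax.random.normal(k2, (num_experts, 2, hidden_size, intermediate_size), dtype=jnp.bfloat16)
    w2 = jax.random.normal(k3, (num_experts, intermediate_size, hidden_size), dtype=jnp.bfloat16)
    gating = jax.random.normal(k4, (num_tokens, num_experts), dtype=jnp.float32)

    # ref_moe loops over tokens and their top-k experts, so it serves as a
    # correctness baseline for the fused kernel, not as a fast path.
    out = ref_moe(tokens, w1, w2, gating, top_k, renormalize_topk_logits=True)
    assert out.shape == (num_tokens, hidden_size)
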
@@ -0,0 +1,1612 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """TPU-Friendly Fused Mixture of Experts (MoE) kernel."""
15
+
16
+ import functools
17
+
18
+ import jax
19
+ import jax.numpy as jnp
20
+ from jax import lax
21
+ from jax._src import dtypes
22
+ from jax.experimental import pallas as pl
23
+ from jax.experimental.pallas import tpu as pltpu
24
+
25
+ P = jax.sharding.PartitionSpec
26
+
27
+ cdiv = pl.cdiv
28
+
29
+
30
+ def align_to(x, a):
31
+ return cdiv(x, a) * a
32
+
33
+
34
+ def get_dtype_packing(dtype):
35
+ bits = (dtypes.bit_width(dtype)
36
+ if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
37
+ return 32 // bits
38
+
39
+
40
+ def broadcast_minor(src, shape):
41
+ if src.shape == shape:
42
+ return src
43
+ assert src.shape[:-1] == shape[:-1]
44
+ assert src.shape[-1] % 128 == 0
45
+ target_minor = align_to(shape[-1], src.shape[-1])
46
+ # no-op concatenation.
47
+ return jnp.concatenate([src for _ in range(target_minor // src.shape[-1])],
48
+ axis=-1)[..., :shape[-1]]
49
+
50
+
51
+ def swigluoai(gate: jax.Array,
52
+ up: jax.Array,
53
+ *,
54
+ alpha: float = 1.702,
55
+ limit: float = 7.0) -> jax.Array:
56
+ """Activation used in some models such as GPT-OSS."""
57
+ gate = jnp.clip(gate, a_max=limit)
58
+ up = jnp.clip(up, a_min=-limit, a_max=limit)
59
+ glu = gate * jax.nn.sigmoid(alpha * gate)
60
+ return (up + 1.0) * glu
61
+
62
+
63
+ def activation_fn(acc1, acc3, act_fn):
64
+ if act_fn == "silu":
65
+ return jax.nn.silu(acc1) * acc3
66
+ elif act_fn == "gelu":
67
+ return jax.nn.gelu(acc1) * acc3
68
+ elif act_fn == "swigluoai":
69
+ return swigluoai(acc1, acc3)
70
+ else:
71
+ raise RuntimeError(f"Unsupported activation function: {act_fn}")
72
+
73
+
74
+ def ref_moe(
75
+ tokens: jax.Array, # (num_tokens, hidden_size)
76
+ w1: jax.Array, # (num_experts, 2, hidden_size, intermediate_size)
77
+ w2: jax.Array, # (num_experts, intermediate_size, hidden_size)
78
+ gating_output: jax.Array, # (num_tokens, num_experts)
79
+ top_k: int,
80
+ *,
81
+ renormalize_topk_logits: bool = False,
82
+ act_fn: str = "silu",
83
+ subc_quant_wsz: int | None = None,
84
+ w1_scale:
85
+ (
86
+ jax.Array | None
87
+ ) = None, # F32(num_experts, 2, hidden_size //subc_quant_wsz, 1, intermediate_size)
88
+ w2_scale:
89
+ (
90
+ jax.Array | None
91
+ ) = None, # F32(num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size)
92
+ b1: jax.Array
93
+ | None = None, # F32(num_experts, 2, 1, intermediate_size)
94
+ b2: jax.Array | None = None, # F32(num_experts, 1, hidden_size)
95
+ ):
96
+ n_tokens = tokens.shape[0] # num_tokens
97
+
98
+ # Compute gating scores for all experts
99
+ gating_logits = jax.nn.softmax(gating_output,
100
+ axis=-1) # [num_tokens, n_experts]
101
+
102
+ # Select top-k experts per token
103
+ top_k_logits, top_k_indices = lax.top_k(
104
+ gating_logits, top_k) # [num_tokens, top_k], [num_tokens, top_k]
105
+
106
+ if renormalize_topk_logits:
107
+ top_k_logits = top_k_logits / jnp.sum(
108
+ top_k_logits, axis=-1, keepdims=True)
109
+
110
+ t_outputs = []
111
+ hidden_size, intermediate_size = w1.shape[-2:]
112
+
113
+ # Process each token individually
114
+ for i in range(n_tokens):
115
+ curr_token = jnp.expand_dims(tokens[i], axis=0) # [1, hidden_size]
116
+ assigned_expert_ids = top_k_indices[
117
+ i] # [top_k] - indices of selected experts for token i
118
+ tok_expert_act = []
119
+
120
+ # Process each selected expert for the current token
121
+ for expert_id in assigned_expert_ids:
122
+ # Get expert weights
123
+ expert_w1 = w1[expert_id, 0].astype(jnp.float32)
124
+ expert_w3 = w1[expert_id, 1].astype(jnp.float32)
125
+ if w1_scale is not None:
126
+ expert_w1 *= jnp.repeat(w1_scale[expert_id, 0, :, 0],
127
+ subc_quant_wsz,
128
+ axis=0)[:hidden_size]
129
+ expert_w3 *= jnp.repeat(w1_scale[expert_id, 1, :, 0],
130
+ subc_quant_wsz,
131
+ axis=0)[:hidden_size]
132
+ expert_weight_1 = jnp.concat(
133
+ [expert_w1, expert_w3],
134
+ axis=-1) # [hidden_size, 2 * intermediate_size]
135
+ expert_weight_2 = w2[expert_id].astype(
136
+ jnp.float32) # [intermediate_size, hidden_size]
137
+ if w2_scale is not None:
138
+ expert_weight_2 *= jnp.repeat(w2_scale[expert_id, :, 0],
139
+ subc_quant_wsz,
140
+ axis=0)[:intermediate_size]
141
+
142
+ # First linear layer with SwiGLU activation
143
+ gmm_1_out = curr_token @ expert_weight_1 # [1, 2 * intermediate_size]
144
+
145
+ # Split into gate and up projections for SwiGLU
146
+ gmm1_w1_proj, gmm1_w3_proj = jnp.split(
147
+ gmm_1_out, 2,
148
+ axis=-1) # [1, intermediate_size], [1, intermediate_size]
149
+ if b1 is not None:
150
+ gmm1_w1_proj += b1[expert_id:expert_id + 1, 0, 0]
151
+ gmm1_w3_proj += b1[expert_id:expert_id + 1, 1, 0]
152
+
153
+ # Apply gated activation: activation(gate) * up
154
+ act = activation_fn(gmm1_w1_proj, gmm1_w3_proj, act_fn)
155
+
156
+ # Second linear layer (down projection)
157
+ gmm_2_out = act @ expert_weight_2 # [1, hidden_size]
158
+ if b2 is not None:
159
+ gmm_2_out += b2[expert_id:expert_id + 1, 0]
160
+ tok_expert_act.append(gmm_2_out)
161
+
162
+ # Combine outputs from all selected experts
163
+ experts_act = jnp.concatenate(tok_expert_act,
164
+ axis=0) # [top_k, hidden_size]
165
+
166
+ # Weighted sum using top-k gating weights
167
+ top_k_weights = top_k_logits[i] # [top_k]
168
+ top_k_weights = jnp.expand_dims(top_k_weights, axis=1) # [top_k, 1]
169
+ weighted_output = jnp.sum(experts_act * top_k_weights,
170
+ axis=0,
171
+ keepdims=True) # [1, hidden_size]
172
+
173
+ t_outputs.append(weighted_output.astype(tokens.dtype))
174
+
175
+ return jnp.concatenate(t_outputs,
176
+ axis=0) # [actual_num_tokens, hidden_size]
177
+
178
+
179
+ def _fused_ep_moe_kernel(
180
+ # Input
181
+ tokens_hbm, # (local_num_tokens, t_packing, hidden_size // t_packing)
182
+ w1_hbm, # (local_num_experts, 2, hidden_size, intermediate_size)
183
+ w2_hbm, # (local_num_experts, intermediate_size, hidden_size)
184
+ # TODO(jevinjiang): We choose F32 scale for easier slicing. The extra
185
+ # latency should be hidden in the pipeline overlaping. But is there a better
186
+     # latency should be hidden in the pipeline overlapping. But is there a better
187
+ w1_scale_hbm, # None | F32(local_num_experts, 2, cdiv(hidden_size, subc_quant_wsz), 1, intermediate_size)
188
+     # way to do this?
189
+ b1_hbm, # None | F32(local_num_experts, 2, 1, intermediate_size)
190
+ b2_hbm, # None | F32(local_num_experts, 1, hidden_size)
191
+ gating_hbm, # (local_num_tokens, padded_num_experts)
192
+ a2a_g_hbm, # (num_experts, bt, t_packing, hidden_size // t_packing)
193
+ # Output
194
+ output_hbm, # (local_num_tokens, hidden_size)
195
+ # Scratch
196
+ t2e_routing_x2_smem, # <bt_sem_id> (2, bt, padded_top_k)
197
+ d2e_count_x2_smem, # <bt_sem_id> (2, num_devices, 1, padded_num_experts)
198
+ expert_offsets_x2_smem, # <bt_sem_id> (2, 2, padded_num_experts): for a2a_s and a2a_g
199
+ expert_starts_x2_smem, # <bt_sem_id> (2, 1, padded_num_experts)
200
+ expert_sizes_x2_smem, # <bt_sem_id> (2, 1, padded_num_experts)
201
+ a2a_s_sends_x2_smem, # <e_sem_id> (2,)
202
+ a2a_s_x2_vmem, # <e_sem_id> (2, bt * num_devices, t_packing, hidden_size // t_packing)
203
+ a2a_s_acc_x2_vmem, # <e_sem_id> (2, bt * num_devices, t_packing, hidden_size // t_packing)
204
+ ### Accumulation for gathered tokens:
205
+ a2a_g_acc_vmem, # (top_k, bt, t_packing, hidden_size // t_packing)
206
+ ### Expert weight double buffering:
207
+ b_gating_x2_vmem, # <bt_sem_id> (2, bt, padded_num_experts)
208
+ b_output_x2_vmem, # <bt_sem_id> (2, bt, hidden_size)
209
+ b_w1_x2_vmem, # <bw_sem_id> (2, t_packing, bd1 // t_packing, bf)
210
+ b_w3_x2_vmem, # <bw_sem_id> (2, t_packing, bd1 // t_packing, bf)
211
+ b_w2_x2_vmem, # <bw_sem_id> (2, t_packing, bf, bd2 // t_packing)
212
+ b_w1_scale_x2_vmem, # None | <bw_sem_id> (2, t_packing, bd1 // t_packing // subc_quant_wsz, 1, bf)
213
+ b_w3_scale_x2_vmem, # None | <bw_sem_id> (2, t_packing, bd1 // t_packing // subc_quant_wsz, 1, bf)
214
+ b_w2_scale_x2_vmem, # None | <bw_sem_id> (2, t_packing, bf // subc_quant_wsz, 1, bd2 // t_packing)
215
+ b_b1_x2_vmem, # None | <bw_sem_id> (2, 1, bf)
216
+ b_b3_x2_vmem, # None | <bw_sem_id> (2, 1, bf)
217
+ b_b2_x2_vmem, # None | <bw_sem_id> (2, t_packing, 1, bd2 // t_packing)
218
+ b_acc_vmem, # F32(bt * num_devices, 1, bf * 2)
219
+ ### Semaphores:
220
+ local_sems, # (2, 5): 2 x [b_gating_sem, b_w1_sem, b_w2_sem, b_w3_sem, b_output_sem]
221
+ send_sems, # <e_sem_id> (2,)
222
+ recv_sems, # <e_sem_id> (2,)
223
+ a2a_gather_sem,
224
+ a2a_acc_sem,
225
+ *,
226
+ top_k: int,
227
+ renormalize_topk_logits: bool,
228
+ ep_axis_name: str,
229
+ act_fn: str,
230
+ subc_quant_wsz: int | None = None,
231
+ # Kernel tuning params.
232
+ bt: int, # Block size of local_num_tokens.
233
+ bf: int, # Block size of intermediate_size.
234
+ bd1: int, # Block size of hidden_size in w1.
235
+ bd2: int, # Block size of hidden_size in w2.
236
+ btc: int, # Compute size of block tokens for active expert.
237
+ bfc: int, # Compute size of block intermediate_size.
238
+ bd1c: int, # Compute size of block hidden_size.
239
+ bd2c: int, # Compute size of block hidden_size.
240
+ ):
241
+ my_id = lax.axis_index(ep_axis_name)
242
+ num_devices = lax.axis_size(ep_axis_name)
243
+ local_num_tokens = tokens_hbm.shape[0]
244
+ local_num_experts, intermediate_size, hidden_size = w2_hbm.shape
245
+ right_id = (my_id + 1) % num_devices
246
+ num_experts = a2a_g_hbm.shape[0]
247
+ padded_num_experts = d2e_count_x2_smem.shape[-1]
248
+ padded_top_k = t2e_routing_x2_smem.shape[-1]
249
+ assert padded_num_experts == align_to(num_experts, 128)
250
+ assert padded_top_k == align_to(top_k, 128)
251
+
252
+ t_dtype = tokens_hbm.dtype
253
+ t_packing = get_dtype_packing(t_dtype)
254
+ t_bitwidth = 32 // t_packing
255
+ assert a2a_g_hbm.dtype == t_dtype
256
+ assert w1_hbm.dtype == w2_hbm.dtype
257
+
258
+ assert bd1 % bd1c == 0
259
+ assert bd2 % bd2c == 0
260
+ assert bf % bfc == 0
261
+ assert hidden_size % t_packing == 0
262
+ assert bd1 % t_packing == 0
263
+ assert bd2 % t_packing == 0
264
+ assert bd1c % t_packing == 0
265
+ assert bd2c % t_packing == 0
266
+
267
+ h_per_t_packing = hidden_size // t_packing
268
+ assert tokens_hbm.shape[-1] == h_per_t_packing
269
+ bd1_per_t_packing = bd1 // t_packing
270
+ bd2_per_t_packing = bd2 // t_packing
271
+ bd1c_per_t_packing = bd1c // t_packing
272
+ bd2c_per_t_packing = bd2c // t_packing
273
+
274
+ if subc_quant_wsz is not None:
275
+ assert subc_quant_wsz % 256 == 0
276
+ assert bd1c_per_t_packing == subc_quant_wsz
277
+ assert bfc == subc_quant_wsz
278
+ assert bd1 % subc_quant_wsz == 0
279
+ assert bf % subc_quant_wsz == 0
280
+ assert bd1_per_t_packing % subc_quant_wsz == 0
281
+ assert h_per_t_packing % subc_quant_wsz == 0
282
+
283
+ num_bt = cdiv(local_num_tokens, bt)
284
+ num_bf = cdiv(intermediate_size, bf)
285
+ num_bd1 = cdiv(hidden_size, bd1)
286
+ num_bd2 = cdiv(hidden_size, bd2)
287
+
288
+ def get_mesh_device_id(ep_rank):
289
+ dp_rank = jax.lax.axis_index("data")
290
+ return (dp_rank, ep_rank)
291
+
292
+ def sync_barrier():
293
+ barrier_sem = pltpu.get_barrier_semaphore()
294
+ pltpu.semaphore_signal(
295
+ barrier_sem,
296
+ device_id=get_mesh_device_id(right_id),
297
+ device_id_type=pltpu.DeviceIdType.MESH,
298
+ )
299
+ pltpu.semaphore_wait(barrier_sem, 1)
300
+
301
+ def start_fetch_b_gating(bt_id, priority=0):
302
+ is_valid = jnp.logical_and(0 <= bt_id, bt_id < num_bt)
303
+ sz = pl.multiple_of(lax.select(is_valid, bt, 0), bt)
304
+ bt_sem_id = (bt_id + 2) % 2
305
+ b_gating_sem = local_sems.at[bt_sem_id, 0]
306
+ pltpu.make_async_copy(
307
+ src_ref=gating_hbm.at[pl.ds(bt_id * bt, sz)],
308
+ dst_ref=b_gating_x2_vmem.at[bt_sem_id, pl.ds(0, sz)],
309
+ sem=b_gating_sem,
310
+ ).start(priority=priority)
311
+
312
+ def wait_fetch_b_gating(bt_id):
313
+ bt_sem_id = bt_id % 2
314
+ b_gating_sem = local_sems.at[bt_sem_id, 0]
315
+ pltpu.make_async_copy(
316
+ src_ref=b_gating_x2_vmem.at[bt_sem_id],
317
+ dst_ref=b_gating_x2_vmem.at[bt_sem_id],
318
+ sem=b_gating_sem,
319
+ ).wait()
320
+
321
+ def get_top_k(input, top_k, renormalize_topk_logits):
322
+ assert len(input.shape) == 2, input.shape
323
+ input = input.astype(jnp.float32)
324
+ padded_k_shape = (input.shape[0], padded_top_k)
325
+ top_k_logits_lst = []
326
+ top_k_indices_lst = []
327
+ t2e = jnp.zeros(input.shape, dtype=jnp.int32)
328
+ t2e_routing = jnp.zeros(padded_k_shape, dtype=jnp.int32)
329
+ iota = jax.lax.broadcasted_iota(jnp.int32, input.shape, 1)
330
+ padded_k_iota = jax.lax.broadcasted_iota(jnp.int32, padded_k_shape, 1)
331
+ top_k_logits_sum = jnp.zeros(padded_k_shape, jnp.float32)
332
+
333
+ for k_id in range(top_k):
334
+ # TODO(jevinjiang): return both top_k values and indices in Mosaic
335
+ top_k_logits = jnp.broadcast_to(
336
+ jnp.max(input[:, :num_experts], axis=1, keepdims=True),
337
+ padded_k_shape,
338
+ ).astype(input.dtype)
339
+ top_k_logits_lst.append(top_k_logits)
340
+ if renormalize_topk_logits:
341
+ top_k_logits_sum += top_k_logits
342
+ # TODO(jevinjiang): support bf16 argmax in Mosaic
343
+ top_k_indices = jnp.broadcast_to(
344
+ jnp.argmax(input[:, :num_experts], axis=1, keepdims=True),
345
+ padded_k_shape,
346
+ )
347
+ top_k_indices_lst.append(top_k_indices)
348
+ t2e_routing = jnp.where(padded_k_iota == k_id, top_k_indices,
349
+ t2e_routing)
350
+ mask = iota == broadcast_minor(top_k_indices, input.shape)
351
+ t2e += mask.astype(jnp.int32)
352
+ if k_id != top_k - 1:
353
+ input = jnp.where(mask, -jnp.inf, input)
354
+
355
+ if renormalize_topk_logits:
356
+ for k_id in range(top_k):
357
+ top_k_logits_lst[k_id] /= top_k_logits_sum
358
+
359
+ expert_sizes = jnp.sum(t2e, axis=0, keepdims=True)
360
+ expert_starts = jnp.zeros_like(expert_sizes)
361
+ return top_k_logits_lst, t2e_routing, expert_sizes, expert_starts
362
+
363
+ def all_reduce_metadata(bt_sem_id, t2e_routing, starts, sizes):
364
+ send_sem = send_sems.at[0]
365
+ recv_sem = recv_sems.at[0]
366
+
367
+ # All-reduce to accumulate starts and sizes and transfer to SMEM.
368
+ def _all_reduce_metadata(
369
+ t2e_routing_vmem,
370
+ d2e_count_vmem,
371
+ offsets_vmem,
372
+ starts_vmem,
373
+ sizes_vmem,
374
+ ):
375
+ offsets_vmem[...] = jnp.zeros_like(offsets_vmem)
376
+ # TODO(jevinjiang): check how slow is VMEM -> SMEM.
377
+ offsets_copy = pltpu.async_copy(
378
+ src_ref=offsets_vmem,
379
+ dst_ref=expert_offsets_x2_smem.at[bt_sem_id],
380
+ sem=send_sem,
381
+ )
382
+ t2e_routing_vmem[...] = t2e_routing
383
+ t2e_routing_copy = pltpu.async_copy(
384
+ src_ref=t2e_routing_vmem,
385
+ dst_ref=t2e_routing_x2_smem.at[bt_sem_id],
386
+ sem=send_sem,
387
+ )
388
+ reduced_sizes = sizes
389
+ reduced_starts = starts
390
+ row_id = my_id
391
+ d2e_count_vmem[row_id] = sizes
392
+ for i in range(num_devices - 1):
393
+ sync_barrier()
394
+ # TODO(jevinjiang): we can use double buffering to improve AR if needed.
395
+ pltpu.async_remote_copy(
396
+ src_ref=d2e_count_vmem.at[row_id],
397
+ dst_ref=d2e_count_vmem.at[row_id],
398
+ send_sem=send_sem,
399
+ recv_sem=recv_sem,
400
+ device_id=get_mesh_device_id(right_id),
401
+ device_id_type=pltpu.DeviceIdType.MESH,
402
+ ).wait()
403
+ row_id = (row_id + num_devices - 1) % num_devices
404
+ new_sizes = d2e_count_vmem[row_id]
405
+ reduced_sizes += new_sizes
406
+ reduced_starts += lax.select(my_id > i, new_sizes,
407
+ jnp.zeros_like(new_sizes))
408
+ starts_vmem[...] = reduced_starts
409
+ sizes_vmem[...] = reduced_sizes
410
+
411
+ starts_copy = pltpu.async_copy(
412
+ src_ref=starts_vmem,
413
+ dst_ref=expert_starts_x2_smem.at[bt_sem_id],
414
+ sem=send_sem,
415
+ )
416
+ sizes_copy = pltpu.async_copy(
417
+ src_ref=sizes_vmem,
418
+ dst_ref=expert_sizes_x2_smem.at[bt_sem_id],
419
+ sem=send_sem,
420
+ )
421
+
422
+ # TODO(jevinjiang): if d2e_count is too big, we can store in HBM and fetch
423
+ # to SMEM partially.
424
+ d2e_count_copy = pltpu.async_copy(
425
+ src_ref=d2e_count_vmem,
426
+ dst_ref=d2e_count_x2_smem.at[bt_sem_id],
427
+ sem=send_sem,
428
+ )
429
+
430
+ t2e_routing_copy.wait()
431
+ d2e_count_copy.wait()
432
+ offsets_copy.wait()
433
+ starts_copy.wait()
434
+ sizes_copy.wait()
435
+
436
+ pl.run_scoped(
437
+ _all_reduce_metadata,
438
+ pltpu.VMEM(t2e_routing_x2_smem.shape[1:],
439
+ t2e_routing_x2_smem.dtype),
440
+ pltpu.VMEM(d2e_count_x2_smem.shape[1:], d2e_count_x2_smem.dtype),
441
+ pltpu.VMEM(expert_offsets_x2_smem.shape[1:],
442
+ expert_offsets_x2_smem.dtype),
443
+ pltpu.VMEM(expert_starts_x2_smem.shape[1:],
444
+ expert_starts_x2_smem.dtype),
445
+ pltpu.VMEM(expert_sizes_x2_smem.shape[1:],
446
+ expert_sizes_x2_smem.dtype),
447
+ )
448
+
449
+ def start_a2a_scatter(bt_id, e_sem_id, local_e_id):
450
+ bt_sem_id = bt_id % 2
451
+
452
+ # Counting the number of remote sends from the current device.
453
+ send_sz = 0
454
+ for bt_t_id in range(bt):
455
+ for k_id in range(top_k):
456
+ e_id = t2e_routing_x2_smem[bt_sem_id, bt_t_id, k_id]
457
+ is_active_expert = e_id % local_num_experts == local_e_id
458
+ recv_id = e_id // local_num_experts
459
+ offset = expert_offsets_x2_smem[bt_sem_id, 0, e_id]
460
+ sz = lax.select(is_active_expert, 1, 0)
461
+ is_local = recv_id == my_id
462
+ local_sz = lax.select(is_local, sz, 0)
463
+ remote_sz = lax.select(is_local, 0, sz)
464
+ send_sz += remote_sz
465
+ expert_offsets_x2_smem[bt_sem_id, 0,
466
+ e_id] = (offset + local_sz + remote_sz)
467
+ start = expert_starts_x2_smem[bt_sem_id, 0, e_id] + offset
468
+ t_id = bt * bt_id + bt_t_id
469
+ # TODO(jevinjiang): compare the perf when using branches.
470
+ pltpu.make_async_copy(
471
+ src_ref=tokens_hbm.at[pl.ds(t_id, local_sz)],
472
+ dst_ref=a2a_s_x2_vmem.at[e_sem_id,
473
+ pl.ds(start, local_sz)],
474
+ sem=recv_sems.at[e_sem_id],
475
+ ).start()
476
+ pltpu.make_async_remote_copy(
477
+ src_ref=tokens_hbm.at[pl.ds(t_id, remote_sz)],
478
+ dst_ref=a2a_s_x2_vmem.at[e_sem_id,
479
+ pl.ds(start, remote_sz)],
480
+ send_sem=send_sems.at[e_sem_id],
481
+ recv_sem=recv_sems.at[e_sem_id],
482
+ device_id=get_mesh_device_id(recv_id),
483
+ device_id_type=pltpu.DeviceIdType.MESH,
484
+ ).start()
485
+ a2a_s_sends_x2_smem[e_sem_id] = send_sz
486
+
487
+ def wait_a2a_scatter_recv(bt_id, e_sem_id, local_e_id):
488
+ bt_sem_id = bt_id % 2
489
+ e_id = my_id * local_num_experts + local_e_id
490
+ sz = expert_sizes_x2_smem[bt_sem_id, 0, e_id]
491
+ pltpu.make_async_copy(
492
+ src_ref=a2a_s_x2_vmem.at[e_sem_id, pl.ds(0, sz)],
493
+ dst_ref=a2a_s_x2_vmem.at[e_sem_id, pl.ds(0, sz)],
494
+ sem=recv_sems.at[e_sem_id],
495
+ ).wait()
496
+
497
+ def wait_a2a_scatter_send(bt_id, e_sem_id, local_e_id):
498
+ del bt_id, local_e_id
499
+ sz = a2a_s_sends_x2_smem[e_sem_id]
500
+ pltpu.make_async_copy(
501
+ src_ref=a2a_s_x2_vmem.at[e_sem_id, pl.ds(0, sz)],
502
+ dst_ref=a2a_s_x2_vmem.at[e_sem_id, pl.ds(0, sz)],
503
+ sem=send_sems.at[e_sem_id],
504
+ ).wait()
505
+
506
+ def start_a2a_gather(bt_id, e_sem_id, local_e_id):
507
+ my_e_id = my_id * local_num_experts + local_e_id
508
+ bt_sem_id = bt_id % 2
509
+ start = 0
510
+ for recv_id in range(num_devices):
511
+ sz = d2e_count_x2_smem[bt_sem_id, recv_id, 0, my_e_id]
512
+ is_local = recv_id == my_id
513
+ local_sz = lax.select(is_local, sz, 0)
514
+ remote_sz = lax.select(is_local, 0, sz)
515
+ pltpu.make_async_copy(
516
+ src_ref=a2a_s_acc_x2_vmem.at[e_sem_id,
517
+ pl.ds(start, local_sz)],
518
+ dst_ref=a2a_g_hbm.at[my_e_id, pl.ds(0, local_sz)],
519
+ sem=a2a_gather_sem,
520
+ ).start()
521
+ pltpu.make_async_remote_copy(
522
+ src_ref=a2a_s_acc_x2_vmem.at[e_sem_id,
523
+ pl.ds(start, remote_sz)],
524
+ dst_ref=a2a_g_hbm.at[my_e_id, pl.ds(0, remote_sz)],
525
+ send_sem=send_sems.at[e_sem_id],
526
+ recv_sem=a2a_gather_sem,
527
+ device_id=get_mesh_device_id(recv_id),
528
+ device_id_type=pltpu.DeviceIdType.MESH,
529
+ ).start()
530
+ start += sz
531
+
532
+ def wait_a2a_gather_send(bt_id, e_sem_id, local_e_id):
533
+ my_e_id = my_id * local_num_experts + local_e_id
534
+ bt_sem_id = bt_id % 2
535
+ sz = expert_sizes_x2_smem[bt_sem_id, 0, my_e_id]
536
+ local_sz = d2e_count_x2_smem[bt_sem_id, my_id, 0, my_e_id]
537
+ remote_sz = sz - local_sz
538
+ is_valid = jnp.logical_and(0 <= local_e_id, local_e_id
539
+ < local_num_experts)
540
+ remote_sz = lax.select(is_valid, remote_sz, 0)
541
+ pltpu.make_async_copy(
542
+ src_ref=a2a_g_hbm.at[0, pl.ds(0, remote_sz)],
543
+ dst_ref=a2a_g_hbm.at[0, pl.ds(0, remote_sz)],
544
+ sem=send_sems.at[e_sem_id],
545
+ ).wait()
546
+
547
+ def wait_a2a_gather_recv_all():
548
+ sz = top_k * bt
549
+ pltpu.make_async_copy(
550
+ src_ref=a2a_g_hbm.at[0, pl.ds(0, sz)],
551
+ dst_ref=a2a_g_hbm.at[0, pl.ds(0, sz)],
552
+ sem=a2a_gather_sem,
553
+ ).wait()
554
+
555
+ def start_fetch_bw1(local_e_id, bw1_sem_id, bf_id, bd1_id):
556
+ for p in range(t_packing):
557
+ offset = p * h_per_t_packing + bd1_id * bd1_per_t_packing
558
+ pltpu.make_async_copy(
559
+ src_ref=w1_hbm.at[
560
+ local_e_id,
561
+ 0,
562
+ pl.ds(offset, bd1_per_t_packing),
563
+ pl.ds(bf_id * bf, bf),
564
+ ],
565
+ dst_ref=b_w1_x2_vmem.at[bw1_sem_id, p],
566
+ sem=local_sems.at[bw1_sem_id, 1],
567
+ ).start()
568
+ if w1_scale_hbm is not None:
569
+ assert subc_quant_wsz is not None
570
+ pltpu.make_async_copy(
571
+ src_ref=w1_scale_hbm.at[
572
+ local_e_id,
573
+ 0,
574
+ pl.ds(
575
+ offset // subc_quant_wsz,
576
+ bd1_per_t_packing // subc_quant_wsz,
577
+ ),
578
+ pl.ds(0, 1),
579
+ pl.ds(bf_id * bf, bf),
580
+ ],
581
+ dst_ref=b_w1_scale_x2_vmem.at[bw1_sem_id, p],
582
+ sem=local_sems.at[bw1_sem_id, 1],
583
+ ).start()
584
+ if b1_hbm is not None and bd1_id == 0:
585
+ pltpu.make_async_copy(
586
+ src_ref=b1_hbm.at[local_e_id, 0,
587
+ pl.ds(0, 1),
588
+ pl.ds(bf_id * bf, bf)],
589
+ dst_ref=b_b1_x2_vmem.at[bf_id % 2],
590
+ sem=local_sems.at[bw1_sem_id, 1],
591
+ ).start()
592
+
593
+ def start_fetch_bw2(local_e_id, bw2_sem_id, bf_id, bd2_id):
594
+ for p in range(t_packing):
595
+ offset = p * h_per_t_packing + bd2_id * bd2_per_t_packing
596
+ pltpu.make_async_copy(
597
+ src_ref=w2_hbm.at[
598
+ local_e_id,
599
+ pl.ds(bf_id * bf, bf),
600
+ pl.ds(offset, bd2_per_t_packing),
601
+ ],
602
+ dst_ref=b_w2_x2_vmem.at[bw2_sem_id, p],
603
+ sem=local_sems.at[bw2_sem_id, 2],
604
+ ).start()
605
+ if w2_scale_hbm is not None:
606
+ assert subc_quant_wsz is not None
607
+ pltpu.make_async_copy(
608
+ src_ref=w2_scale_hbm.at[
609
+ local_e_id,
610
+ pl.ds(bf_id * bf // subc_quant_wsz, bf //
611
+ subc_quant_wsz),
612
+ pl.ds(0, 1),
613
+ pl.ds(offset, bd2_per_t_packing),
614
+ ],
615
+ dst_ref=b_w2_scale_x2_vmem.at[bw2_sem_id, p],
616
+ sem=local_sems.at[bw2_sem_id, 2],
617
+ ).start()
618
+ if b2_hbm is not None and bf_id == 0:
619
+ pltpu.make_async_copy(
620
+ src_ref=b2_hbm.at[local_e_id,
621
+ pl.ds(0, 1),
622
+ pl.ds(offset, bd2_per_t_packing)],
623
+ dst_ref=b_b2_x2_vmem.at[bd2_id % 2, p],
624
+ sem=local_sems.at[bw2_sem_id, 2],
625
+ ).start()
626
+
627
+ def start_fetch_bw3(local_e_id, bw3_sem_id, bf_id, bd3_id):
628
+ for p in range(t_packing):
629
+ offset = p * h_per_t_packing + bd3_id * bd1_per_t_packing
630
+ pltpu.make_async_copy(
631
+ src_ref=w1_hbm.at[
632
+ local_e_id,
633
+ 1,
634
+ pl.ds(offset, bd1_per_t_packing),
635
+ pl.ds(bf_id * bf, bf),
636
+ ],
637
+ dst_ref=b_w3_x2_vmem.at[bw3_sem_id, p],
638
+ sem=local_sems.at[bw3_sem_id, 3],
639
+ ).start()
640
+ if w1_scale_hbm is not None:
641
+ assert subc_quant_wsz is not None
642
+ pltpu.make_async_copy(
643
+ src_ref=w1_scale_hbm.at[
644
+ local_e_id,
645
+ 1,
646
+ pl.ds(
647
+ offset // subc_quant_wsz,
648
+ bd1_per_t_packing // subc_quant_wsz,
649
+ ),
650
+ pl.ds(0, 1),
651
+ pl.ds(bf_id * bf, bf),
652
+ ],
653
+ dst_ref=b_w3_scale_x2_vmem.at[bw3_sem_id, p],
654
+ sem=local_sems.at[bw3_sem_id, 3],
655
+ ).start()
656
+ if b1_hbm is not None and bd3_id == 0:
657
+ pltpu.make_async_copy(
658
+ src_ref=b1_hbm.at[local_e_id, 1,
659
+ pl.ds(0, 1),
660
+ pl.ds(bf_id * bf, bf)],
661
+ dst_ref=b_b3_x2_vmem.at[bf_id % 2],
662
+ sem=local_sems.at[bw3_sem_id, 3],
663
+ ).start()
664
+
665
+ def wait_fetch_bw1(local_e_id, bw1_sem_id, bf_id, bd1_id):
666
+ del local_e_id
667
+ pltpu.make_async_copy(
668
+ src_ref=b_w1_x2_vmem.at[bw1_sem_id],
669
+ dst_ref=b_w1_x2_vmem.at[bw1_sem_id],
670
+ sem=local_sems.at[bw1_sem_id, 1],
671
+ ).wait()
672
+ if w1_scale_hbm is not None:
673
+ pltpu.make_async_copy(
674
+ src_ref=b_w1_scale_x2_vmem.at[bw1_sem_id],
675
+ dst_ref=b_w1_scale_x2_vmem.at[bw1_sem_id],
676
+ sem=local_sems.at[bw1_sem_id, 1],
677
+ ).wait()
678
+ if b1_hbm is not None and bd1_id == 0:
679
+ pltpu.make_async_copy(
680
+ src_ref=b_b1_x2_vmem.at[bf_id % 2],
681
+ dst_ref=b_b1_x2_vmem.at[bf_id % 2],
682
+ sem=local_sems.at[bw1_sem_id, 1],
683
+ ).wait()
684
+
685
+ def wait_fetch_bw2(local_e_id, bw2_sem_id, bf_id, bd2_id):
686
+ del local_e_id
687
+ pltpu.make_async_copy(
688
+ src_ref=b_w2_x2_vmem.at[bw2_sem_id],
689
+ dst_ref=b_w2_x2_vmem.at[bw2_sem_id],
690
+ sem=local_sems.at[bw2_sem_id, 2],
691
+ ).wait()
692
+ if w2_scale_hbm is not None:
693
+ pltpu.make_async_copy(
694
+ src_ref=b_w2_scale_x2_vmem.at[bw2_sem_id],
695
+ dst_ref=b_w2_scale_x2_vmem.at[bw2_sem_id],
696
+ sem=local_sems.at[bw2_sem_id, 2],
697
+ ).wait()
698
+ if b2_hbm is not None and bf_id == 0:
699
+ pltpu.make_async_copy(
700
+ src_ref=b_b2_x2_vmem.at[bd2_id % 2],
701
+ dst_ref=b_b2_x2_vmem.at[bd2_id % 2],
702
+ sem=local_sems.at[bw2_sem_id, 2],
703
+ ).wait()
704
+
705
+ def wait_fetch_bw3(local_e_id, bw3_sem_id, bf_id, bd3_id):
706
+ del local_e_id
707
+ pltpu.make_async_copy(
708
+ src_ref=b_w3_x2_vmem.at[bw3_sem_id],
709
+ dst_ref=b_w3_x2_vmem.at[bw3_sem_id],
710
+ sem=local_sems.at[bw3_sem_id, 3],
711
+ ).wait()
712
+ if w1_scale_hbm is not None:
713
+ pltpu.make_async_copy(
714
+ src_ref=b_w3_scale_x2_vmem.at[bw3_sem_id],
715
+ dst_ref=b_w3_scale_x2_vmem.at[bw3_sem_id],
716
+ sem=local_sems.at[bw3_sem_id, 3],
717
+ ).wait()
718
+ if b1_hbm is not None and bd3_id == 0:
719
+ pltpu.make_async_copy(
720
+ src_ref=b_b3_x2_vmem.at[bf_id % 2],
721
+ dst_ref=b_b3_x2_vmem.at[bf_id % 2],
722
+ sem=local_sems.at[bw3_sem_id, 3],
723
+ ).wait()
724
+
725
+ def start_fetch_next_bw(local_e_id, bw_sem_id, bf_id, bd1_id, bd2_id):
726
+ next_bd1_id = bd1_id + 1
727
+ next_bd2_id = bd2_id + 1
728
+ next_sem_id = (bw_sem_id + 1) % 2
729
+
730
+ if bf_id >= num_bf:
731
+ return
732
+ if next_bd1_id < num_bd1:
733
+ start_fetch_bw1(local_e_id, next_sem_id, bf_id, next_bd1_id)
734
+ start_fetch_bw3(local_e_id, next_sem_id, bf_id, next_bd1_id)
735
+ elif next_bd1_id == num_bd1:
736
+ start_fetch_bw2(local_e_id, next_sem_id, bf_id, 0)
737
+ elif next_bd2_id < num_bd2:
738
+ start_fetch_bw2(local_e_id, next_sem_id, bf_id, next_bd2_id)
739
+ elif next_bd2_id == num_bd2:
740
+ start_fetch_next_bw(local_e_id, bw_sem_id, bf_id + 1, -1, -1)
741
+ else:
742
+ raise RuntimeError("Unreachable")
743
+
744
+ def dynamic_ffn1(
745
+ t_b32_vmem,
746
+ w1_vmem,
747
+ w1_scale_vmem,
748
+ b1_vmem,
749
+ w3_vmem,
750
+ w3_scale_vmem,
751
+ b3_vmem,
752
+ acc1_vmem,
753
+ acc3_vmem,
754
+ dyn_sz,
755
+ should_init,
756
+ ):
757
+ assert t_b32_vmem.shape == (bt * num_devices, bd1 // t_packing)
758
+ assert w1_vmem.shape == w3_vmem.shape == (t_packing, bd1_per_t_packing,
759
+ bf)
760
+ assert acc1_vmem.shape == acc3_vmem.shape == (bt * num_devices, bf)
761
+ assert bd1 % (t_packing * 128) == 0, (bd1, t_packing)
762
+ assert bd1c % (t_packing * 128) == 0, (bd1c, t_packing)
763
+ if w1_scale_vmem is not None:
764
+ assert w1_scale_vmem.shape == (
765
+ t_packing,
766
+ bd1_per_t_packing // subc_quant_wsz,
767
+ 1,
768
+ bf,
769
+ )
770
+ assert bd1c_per_t_packing == subc_quant_wsz
771
+ if w3_scale_vmem is not None:
772
+ assert w3_scale_vmem.shape == (
773
+ t_packing,
774
+ bd1_per_t_packing // subc_quant_wsz,
775
+ 1,
776
+ bf,
777
+ )
778
+ assert bd1c_per_t_packing == subc_quant_wsz
779
+
780
+ num_loops = cdiv(dyn_sz, btc)
781
+ repack_ty = jnp.dtype(f"int{t_bitwidth}")
782
+
783
+ def body(btc_id, _):
784
+ for bd1c_id in range(cdiv(bd1, bd1c)):
785
+ t_b32 = t_b32_vmem[
786
+ pl.ds(btc_id * btc, btc),
787
+ pl.ds(bd1c_id * bd1c_per_t_packing, bd1c_per_t_packing),
788
+ ]
789
+ for p_id in range(t_packing):
790
+ t = pltpu.bitcast(t_b32.astype(repack_ty), t_dtype)
791
+ t_b32 = t_b32 >> t_bitwidth
792
+ for bfc_id in range(cdiv(bf, bfc)):
793
+ w_slices = (
794
+ p_id,
795
+ pl.ds(bd1c_id * bd1c_per_t_packing,
796
+ bd1c_per_t_packing),
797
+ pl.ds(bfc_id * bfc, bfc),
798
+ )
799
+ w1 = w1_vmem[*w_slices]
800
+ acc1 = jnp.dot(t,
801
+ w1,
802
+ preferred_element_type=jnp.float32)
803
+
804
+ if w1_scale_vmem is not None:
805
+ w1_scale_slices = (
806
+ p_id,
807
+ bd1c_id,
808
+ pl.ds(0, 1),
809
+ pl.ds(bfc_id * bfc, bfc),
810
+ )
811
+ # TODO(jevinjiang): can use mosaic to load with stride 0.
812
+ w1_scale = jnp.broadcast_to(
813
+ w1_scale_vmem[*w1_scale_slices], acc1.shape)
814
+ acc1 *= w1_scale
815
+
816
+ w3 = w3_vmem[*w_slices]
817
+
818
+ acc3 = jnp.dot(t,
819
+ w3,
820
+ preferred_element_type=jnp.float32)
821
+
822
+ if w3_scale_vmem is not None:
823
+ w3_scale_slices = (
824
+ p_id,
825
+ bd1c_id,
826
+ pl.ds(0, 1),
827
+ pl.ds(bfc_id * bfc, bfc),
828
+ )
829
+ w3_scale = jnp.broadcast_to(
830
+ w3_scale_vmem[*w3_scale_slices], acc3.shape)
831
+ acc3 *= w3_scale
832
+
833
+ acc_slices = (pl.ds(btc_id * btc,
834
+ btc), pl.ds(bfc_id * bfc, bfc))
835
+ if should_init and p_id == bd1c_id == 0:
836
+ if b1_vmem is not None:
837
+ b1_scale_slices = (
838
+ pl.ds(0, 1),
839
+ pl.ds(bfc_id * bfc, bfc),
840
+ )
841
+ b1 = jnp.broadcast_to(
842
+ b1_vmem[*b1_scale_slices], acc1.shape)
843
+ acc1 += b1
844
+ if b3_vmem is not None:
845
+ b3_scale_slices = (
846
+ pl.ds(0, 1),
847
+ pl.ds(bfc_id * bfc, bfc),
848
+ )
849
+ b3 = jnp.broadcast_to(
850
+ b3_vmem[*b3_scale_slices], acc1.shape)
851
+ acc3 += b3
852
+
853
+ acc1_vmem[*acc_slices] = acc1
854
+ acc3_vmem[*acc_slices] = acc3
855
+ else:
856
+ acc1_vmem[*acc_slices] += acc1
857
+ acc3_vmem[*acc_slices] += acc3
858
+
859
+ lax.fori_loop(0, num_loops, body, None)
860
+
861
+ def dynamic_ffn2(
862
+ acc1_vmem,
863
+ acc3_vmem,
864
+ w2_vmem,
865
+ w2_scale_vmem,
866
+ b2_vmem,
867
+ res_b32_vmem,
868
+ dyn_sz,
869
+ should_init,
870
+ ):
871
+ assert res_b32_vmem.shape == (bt * num_devices, bd2_per_t_packing)
872
+ assert w2_vmem.shape == (t_packing, bf, bd2_per_t_packing)
873
+ assert acc1_vmem.shape == acc3_vmem.shape == (bt * num_devices, bf)
874
+ assert bd2 % (t_packing * 128) == 0, (bd2, t_packing)
875
+ assert bd2c % (t_packing * 128) == 0, (bd2c, t_packing)
876
+ assert t_dtype in (jnp.float32, jnp.bfloat16)
877
+
878
+ if w2_scale_vmem is not None:
879
+ assert w2_scale_vmem.shape == (
880
+ t_packing,
881
+ bf // subc_quant_wsz,
882
+ 1,
883
+ bd2_per_t_packing,
884
+ )
885
+ assert bfc == subc_quant_wsz
886
+
887
+ num_loops = cdiv(dyn_sz, btc)
888
+ assert bd2c % (t_packing * 128) == 0, (bd2c, t_packing)
889
+
890
+ def body(btc_id, _):
891
+ for bd2c_id in range(cdiv(bd2, bd2c)):
892
+ res_lst = []
893
+ for p_id in range(t_packing):
894
+ res = jnp.zeros((btc, bd2c_per_t_packing),
895
+ dtype=jnp.float32)
896
+
897
+ if b2_vmem is not None and should_init:
898
+ b2_scale_slices = (
899
+ p_id,
900
+ pl.ds(0, 1),
901
+ pl.ds(bd2c_id * bd2c_per_t_packing,
902
+ bd2c_per_t_packing),
903
+ )
904
+ b2 = jnp.broadcast_to(b2_vmem[*b2_scale_slices],
905
+ res.shape)
906
+ res += b2
907
+
908
+ for bfc_id in range(cdiv(bf, bfc)):
909
+ acc_slices = (pl.ds(btc_id * btc,
910
+ btc), pl.ds(bfc_id * bfc, bfc))
911
+ acc1 = acc1_vmem[*acc_slices]
912
+ acc3 = acc3_vmem[*acc_slices]
913
+ act = activation_fn(acc1, acc3, act_fn)
914
+ w2 = w2_vmem[
915
+ p_id,
916
+ pl.ds(bfc_id * bfc, bfc),
917
+ pl.ds(bd2c_id *
918
+ bd2c_per_t_packing, bd2c_per_t_packing),
919
+ ]
920
+ acc = jnp.dot(act,
921
+ w2,
922
+ preferred_element_type=jnp.float32)
923
+ if w2_scale_vmem is not None:
924
+ w2_scale_slices = (
925
+ p_id,
926
+ bfc_id,
927
+ pl.ds(0, 1),
928
+ pl.ds(bd2c_id * bd2c_per_t_packing,
929
+ bd2c_per_t_packing),
930
+ )
931
+ w2_scale = jnp.broadcast_to(
932
+ w2_scale_vmem[*w2_scale_slices], acc.shape)
933
+ acc *= w2_scale
934
+ res += acc
935
+ res = pltpu.bitcast(res, jnp.uint32)
936
+ if t_packing == 2:
937
+ res = res >> 16 << (16 * p_id)
938
+ else:
939
+ assert t_packing == 1
940
+ res_lst.append(res)
941
+ res = res_lst[0]
942
+ # TODO(jevinjiang): use interleaved packing when it is exposed to Pallas
943
+ for i in range(1, t_packing):
944
+ res |= res_lst[i]
945
+ sliced_res_vmem = res_b32_vmem.at[
946
+ pl.ds(btc_id * btc, btc),
947
+ pl.ds(bd2c_id * bd2c_per_t_packing, bd2c_per_t_packing),
948
+ ]
949
+ if should_init:
950
+ sliced_res_vmem[...] = res
951
+ else:
952
+ sliced_res_vmem[...] = pltpu.bitcast(
953
+ sliced_res_vmem.bitcast(t_dtype)[...] +
954
+ pltpu.bitcast(res, t_dtype),
955
+ sliced_res_vmem.dtype,
956
+ )
957
+
958
+ lax.fori_loop(0, num_loops, body, None)
959
+
960
+ def expert_ffn(bt_id, e_sem_id, local_e_id):
961
+ bt_sem_id = bt_id % 2
962
+ bw_sem_id = 0
963
+ # start_fetch_bw1(local_e_id, bw_sem_id, 0, 0)
964
+ # start_fetch_bw3(local_e_id, bw_sem_id, 0, 0)
965
+ a2a_s_b32_vmem = (a2a_s_x2_vmem.bitcast(jnp.uint32).reshape(
966
+ 2, bt * num_devices, hidden_size // t_packing).at[e_sem_id])
967
+ a2a_s_acc_b32_vmem = (a2a_s_acc_x2_vmem.bitcast(jnp.uint32).reshape(
968
+ 2, bt * num_devices, hidden_size // t_packing).at[e_sem_id])
969
+ b_acc_vmem_2d = b_acc_vmem.reshape(bt * num_devices, bf * 2)
970
+ b_acc1_vmem = b_acc_vmem_2d.at[:, :bf]
971
+ b_acc3_vmem = b_acc_vmem_2d.at[:, bf:]
972
+
973
+ e_id = my_id * local_num_experts + local_e_id
974
+ dyn_sz = expert_sizes_x2_smem[bt_sem_id, 0, e_id]
975
+
976
+ bd1_per_t_packing = bd1 // t_packing
977
+ bd2_per_t_packing = bd2 // t_packing
978
+
979
+ for bf_id in range(num_bf):
980
+ for bd1_id in range(num_bd1):
981
+ start_fetch_next_bw(local_e_id, bw_sem_id, bf_id, bd1_id, 0)
982
+ w1_scale_vmem = (None if b_w1_scale_x2_vmem is None else
983
+ b_w1_scale_x2_vmem.at[bw_sem_id])
984
+ w3_scale_vmem = (None if b_w3_scale_x2_vmem is None else
985
+ b_w3_scale_x2_vmem.at[bw_sem_id])
986
+ b1_vmem = None if b_b1_x2_vmem is None else b_b1_x2_vmem.at[
987
+ bf_id % 2]
988
+ b3_vmem = None if b_b3_x2_vmem is None else b_b3_x2_vmem.at[
989
+ bf_id % 2]
990
+ wait_fetch_bw1(local_e_id, bw_sem_id, bf_id, bd1_id)
991
+ wait_fetch_bw3(local_e_id, bw_sem_id, bf_id, bd1_id)
992
+
993
+ dynamic_ffn1(
994
+ t_b32_vmem=a2a_s_b32_vmem.at[
995
+ ...,
996
+ pl.ds(bd1_id * bd1_per_t_packing, bd1_per_t_packing)],
997
+ w1_vmem=b_w1_x2_vmem.at[bw_sem_id],
998
+ w1_scale_vmem=w1_scale_vmem,
999
+ b1_vmem=b1_vmem,
1000
+ w3_vmem=b_w3_x2_vmem.at[bw_sem_id],
1001
+ w3_scale_vmem=w3_scale_vmem,
1002
+ b3_vmem=b3_vmem,
1003
+ acc1_vmem=b_acc1_vmem,
1004
+ acc3_vmem=b_acc3_vmem,
1005
+ dyn_sz=dyn_sz,
1006
+ should_init=(bd1_id == 0),
1007
+ )
1008
+ bw_sem_id = (bw_sem_id + 1) % 2
1009
+
1010
+ for bd2_id in range(num_bd2):
1011
+ start_fetch_next_bw(local_e_id, bw_sem_id, bf_id, num_bd1,
1012
+ bd2_id)
1013
+ wait_fetch_bw2(local_e_id, bw_sem_id, bf_id, bd2_id)
1014
+ if bf_id == bd2_id == 0:
1015
+ wait_a2a_gather_send(bt_id, e_sem_id, local_e_id - 2)
1016
+
1017
+ w2_scale_vmem = (None if b_w2_scale_x2_vmem is None else
1018
+ b_w2_scale_x2_vmem.at[bw_sem_id])
1019
+ b2_vmem = None if b_b2_x2_vmem is None else b_b2_x2_vmem.at[
1020
+ bd2_id % 2]
1021
+ dynamic_ffn2(
1022
+ acc1_vmem=b_acc1_vmem,
1023
+ acc3_vmem=b_acc3_vmem,
1024
+ w2_vmem=b_w2_x2_vmem.at[bw_sem_id],
1025
+ w2_scale_vmem=w2_scale_vmem,
1026
+ b2_vmem=b2_vmem,
1027
+ res_b32_vmem=a2a_s_acc_b32_vmem.at[
1028
+ ...,
1029
+ pl.ds(bd2_id * bd2_per_t_packing, bd2_per_t_packing)],
1030
+ dyn_sz=dyn_sz,
1031
+ should_init=(bf_id == 0),
1032
+ )
1033
+ bw_sem_id = (bw_sem_id + 1) % 2
1034
+
1035
+ def bt_acc(bt_id, top_k_logits_lst):
1036
+ bt_sem_id = bt_id % 2
1037
+ for bt_t_id in range(bt):
1038
+ for k_id in range(top_k):
1039
+ e_id = t2e_routing_x2_smem[bt_sem_id, bt_t_id, k_id]
1040
+ offset = expert_offsets_x2_smem[bt_sem_id, 1, e_id]
1041
+ expert_offsets_x2_smem[bt_sem_id, 1, e_id] = offset + 1
1042
+ pltpu.make_async_copy(
1043
+ src_ref=a2a_g_hbm.at[e_id, pl.ds(offset, 1)],
1044
+ dst_ref=a2a_g_acc_vmem.at[k_id, pl.ds(bt_t_id, 1)],
1045
+ sem=a2a_acc_sem,
1046
+ ).start()
1047
+ pltpu.make_async_copy(
1048
+ src_ref=a2a_g_acc_vmem,
1049
+ dst_ref=a2a_g_acc_vmem,
1050
+ sem=a2a_acc_sem,
1051
+ ).wait()
1052
+ output = None
1053
+ for k_id in range(top_k):
1054
+ acc = a2a_g_acc_vmem[k_id].reshape(bt, hidden_size)
1055
+ logits = broadcast_minor(top_k_logits_lst[k_id], acc.shape)
1056
+ acc *= logits
1057
+ if output is None:
1058
+ output = acc
1059
+ else:
1060
+ output += acc
1061
+ assert output is not None
1062
+ return output.astype(output_hbm.dtype)
1063
+
1064
+ def start_send_bo(bt_id, priority=0):
1065
+ bt_sem_id = bt_id % 2
1066
+ b_output_sem = local_sems.at[bt_sem_id, 4]
1067
+ pltpu.make_async_copy(
1068
+ src_ref=b_output_x2_vmem.at[bt_sem_id],
1069
+ dst_ref=output_hbm.at[pl.ds(bt_id * bt, bt)],
1070
+ sem=b_output_sem,
1071
+ ).start(priority=priority)
1072
+
1073
+ def wait_send_bo(bt_id):
1074
+ is_valid = jnp.logical_and(0 <= bt_id, bt_id < num_bt)
1075
+ sz = pl.multiple_of(lax.select(is_valid, bt, 0), bt)
1076
+ bt_sem_id = (bt_id + 2) % 2
1077
+ b_output_sem = local_sems.at[bt_sem_id, 4]
1078
+ pltpu.make_async_copy(
1079
+ src_ref=output_hbm.at[pl.ds(0, sz)],
1080
+ dst_ref=output_hbm.at[pl.ds(0, sz)],
1081
+ sem=b_output_sem,
1082
+ ).wait()
1083
+
1084
+ ### ------- Kernel start ------- ###
1085
+ start_fetch_b_gating(bt_id=0)
1086
+
1087
+ def run_per_bt(bt_id, e_sem_id):
1088
+ bt_sem_id = bt_id % 2
1089
+ next_bt_id = bt_id + 1
1090
+ start_fetch_b_gating(next_bt_id)
1091
+ wait_fetch_b_gating(bt_id)
1092
+
1093
+ b_gating = b_gating_x2_vmem[bt_sem_id]
1094
+ b_gating_score = jax.nn.softmax(b_gating, axis=-1)
1095
+ top_k_logits_lst, t2e_routing, expert_sizes, expert_starts = get_top_k(
1096
+ b_gating_score, top_k, renormalize_topk_logits)
1097
+
1098
+ all_reduce_metadata(bt_sem_id, t2e_routing, expert_starts,
1099
+ expert_sizes)
1100
+ sync_barrier()
1101
+
1102
+ # Start a2a scatter for first active expert.
1103
+ start_a2a_scatter(bt_id=bt_id, e_sem_id=e_sem_id, local_e_id=0)
1104
+
1105
+ def run_per_expert(local_e_id, e_sem_id):
1106
+ sync_barrier()
1107
+
1108
+ # Prefetch weights for CURRENT active expert.
1109
+ # TODO(jevinjiang): It is hard to prefetch weights in previous iteration
1110
+ # because the expert_ffn keeps overwriting the buffers. Triple buffering
1111
+ # could resolve this but it takes more VMEM scratch. Need further
1112
+ # experiment on this.
1113
+ start_fetch_bw1(local_e_id, bw1_sem_id=0, bf_id=0, bd1_id=0)
1114
+ start_fetch_bw3(local_e_id, bw3_sem_id=0, bf_id=0, bd3_id=0)
1115
+
1116
+ # Next ids.
1117
+ next_e_sem_id = lax.select(e_sem_id == 0, 1, 0)
1118
+ next_local_e_id = local_e_id + 1
1119
+
1120
+ # Start a2a scatter for NEXT active expert.
1121
+ @pl.when(next_local_e_id < local_num_experts)
1122
+ def _():
1123
+ start_a2a_scatter(bt_id, next_e_sem_id, next_local_e_id)
1124
+
1125
+ # Wait a2a scatter for CURRENT active expert.
1126
+ wait_a2a_scatter_recv(bt_id, e_sem_id, local_e_id)
1127
+
1128
+ # Perform FFN for CURRENT active expert.
1129
+ expert_ffn(bt_id, e_sem_id, local_e_id)
1130
+
1131
+ # Start a2a gather to send back tokens for CURRENT active expert.
1132
+ start_a2a_gather(bt_id, e_sem_id, local_e_id)
1133
+
1134
+ # A must-wait before next sync_barrier.
1135
+ wait_a2a_scatter_send(bt_id, e_sem_id, local_e_id)
1136
+ return next_e_sem_id
1137
+
1138
+ e_sem_id = lax.fori_loop(0,
1139
+ local_num_experts,
1140
+ run_per_expert,
1141
+ e_sem_id,
1142
+ unroll=False)
1143
+
1144
+ # Wait to receive a2a gather for ALL experts.
1145
+ wait_a2a_gather_recv_all()
1146
+
1147
+ # Accumulate results for current batch.
1148
+ output = bt_acc(bt_id, top_k_logits_lst)
1149
+
1150
+ # Make sure it is safe to overwrite output buffer.
1151
+ wait_send_bo(bt_id=bt_id - 2)
1152
+ b_output_x2_vmem[bt_sem_id] = output
1153
+
1154
+ start_send_bo(bt_id)
1155
+
1156
+ wait_a2a_gather_send(
1157
+ bt_id,
1158
+ e_sem_id=e_sem_id,
1159
+ local_e_id=local_num_experts - 2,
1160
+ )
1161
+ wait_a2a_gather_send(
1162
+ bt_id,
1163
+ e_sem_id=lax.select(e_sem_id == 0, 1, 0),
1164
+ local_e_id=local_num_experts - 1,
1165
+ )
1166
+ return e_sem_id
1167
+
1168
+ lax.fori_loop(0, num_bt, run_per_bt, 0, unroll=False)
1169
+ wait_send_bo(bt_id=num_bt - 2)
1170
+ wait_send_bo(bt_id=num_bt - 1)
1171
+
1172
+ ### ------- Kernel end ------- ###
1173
+
1174
+
1175
+ @functools.partial(
+     jax.jit,
+     static_argnames=[
+         "mesh",
+         "top_k",
+         "renormalize_topk_logits",
+         "act_fn",
+         "subc_quant_wsz",
+         "bt",
+         "bf",
+         "bd1",
+         "bd2",
+         "btc",
+         "bfc",
+         "bd1c",
+         "bd2c",
+         "ep_axis_name",
+     ],
+ )
+ def fused_ep_moe(
+     mesh: jax.sharding.Mesh,
+     tokens: jax.Array,  # (num_tokens, hidden_size)
+     w1: jax.Array,  # (num_experts, 2, hidden_size, intermediate_size)
+     w2: jax.Array,  # (num_experts, intermediate_size, hidden_size)
+     gating_output: jax.Array,  # (num_tokens, num_experts)
+     top_k: int,
+     *,
+     renormalize_topk_logits: bool = False,
+     act_fn: str = "silu",
+     subc_quant_wsz: int | None = None,
+     w1_scale: jax.Array | None = None,  # F32(num_experts, 2, hidden_size // subc_quant_wsz, 1, intermediate_size)
+     w2_scale: jax.Array | None = None,  # F32(num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size)
+     b1: jax.Array | None = None,  # F32(num_experts, 2, 1, intermediate_size)
+     b2: jax.Array | None = None,  # F32(num_experts, 1, hidden_size)
+     # Kernel tuning parameters.
+     bt: int,
+     bf: int,
+     bd1: int,
+     bd2: int,
+     btc: int,
+     bfc: int,
+     bd1c: int,
+     bd2c: int,
+     ep_axis_name: str = "model",
+ ):
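+     # Editorial note (not in the original source): as far as can be inferred
+     # from the shape checks below, bt blocks the per-device tokens, bf blocks
+     # intermediate_size, and bd1/bd2 block hidden_size for the two matmuls;
+     # btc/bfc/bd1c/bd2c appear to be the corresponding compute sub-block sizes
+     # used inside the kernel.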
+     # TODO(jevinjiang): move all these assertions to a validation function.
+     if len(mesh.shape) != 2:
+         raise NotImplementedError("Only 2D mesh is supported.")
+
+     for axis_name in mesh.axis_names:
+         if axis_name == ep_axis_name:
+             continue
+         if mesh.shape[axis_name] != 1:
+             raise NotImplementedError(
+                 f"Expected all non-EP axes to have size 1 in {mesh.shape=}")
+
+     ep_size = mesh.shape[ep_axis_name]
+     num_devices = ep_size
+
+     num_tokens, hidden_size = tokens.shape
+     num_experts, intermediate_size, _ = w2.shape
+
+     if w1.shape != (num_experts, 2, hidden_size, intermediate_size):
+         raise ValueError(
+             f"Expected {w1.shape=} to be"
+             f" {(num_experts, 2, hidden_size, intermediate_size)}.")
+
+     if w2.shape != (num_experts, intermediate_size, hidden_size):
+         raise ValueError(f"Expected {w2.shape=} to be"
+                          f" {(num_experts, intermediate_size, hidden_size)}.")
+
+     if gating_output.shape != (num_tokens, num_experts):
+         raise ValueError(
+             f"Expected {gating_output.shape=} to be {(num_tokens, num_experts)}."
+         )
+
+     if not (0 < top_k <= num_experts):
+         raise ValueError(
+             f"Expected {top_k=} to be in range (0, {num_experts=}].")
+
+     if hidden_size % 128 != 0 or intermediate_size % 128 != 0:
+         raise ValueError(
+             f"Expected {hidden_size=} and {intermediate_size=} to be aligned to"
+             " 128. Did you pad them with zeros outside the kernel?")
+     if num_tokens % ep_size != 0:
+         raise ValueError(
+             f"Expected {num_tokens=} to be aligned to {ep_size=}.")
+     if num_experts % ep_size != 0:
+         raise ValueError(
+             f"Expected {num_experts=} to be aligned to {ep_size=}.")
+
+     local_num_tokens = num_tokens // ep_size
+     # local_num_experts = num_experts // ep_size
+     padded_num_experts = align_to(num_experts, 128)
+     padded_top_k = align_to(top_k, 128)
+     t_dtype = tokens.dtype
+     t_packing = get_dtype_packing(t_dtype)
+
+     # Override bt.
+     if local_num_tokens <= t_packing * 8:
+         bt = local_num_tokens
+         btc = bt
+     bt = min(local_num_tokens, bt)
+     # The worst case is that all devices send bt tokens to one device.
+     btc = min(bt, btc, bt * num_devices)
+
+     if local_num_tokens % t_packing != 0:
+         raise ValueError(
+             f"Expected {local_num_tokens=} to be aligned to {t_packing=}.")
+
+     if bt % t_packing != 0:
+         raise ValueError(f"Expected {bt=} to be aligned to {t_packing=}.")
+     if local_num_tokens % bt != 0:
+         raise ValueError(
+             f"Expected {local_num_tokens=} to be aligned to {bt=}.")
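+     # Illustrative numbers (editorial note, not in the original source):
+     # assuming get_dtype_packing returns how many elements fit in a 32-bit
+     # word, bf16 tokens give t_packing == 2, so bt must be even and the
+     # tokens are later reshaped to (num_tokens, 2, hidden_size // 2) before
+     # they enter the kernel.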
+
+     if subc_quant_wsz is not None:
+         if subc_quant_wsz <= 0:
+             raise ValueError(f"Expected {subc_quant_wsz=} to be positive.")
+         if subc_quant_wsz % 256 != 0:
+             raise ValueError(
+                 f"Expected {subc_quant_wsz=} to be aligned to 256.")
+         if hidden_size % subc_quant_wsz != 0:
+             raise ValueError(
+                 f"Expected {hidden_size=} to be aligned to {subc_quant_wsz=}.")
+         if intermediate_size % subc_quant_wsz != 0:
+             raise ValueError(
+                 f"Expected {intermediate_size=} to be aligned to {subc_quant_wsz=}."
+             )
+         # We force the compute size of the contracting dim to be subc_quant_wsz
+         # so the same scale can be applied after matmul and accumulation.
+         bd1c = subc_quant_wsz * t_packing
+         bfc = subc_quant_wsz
+
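+     # Worked example (editorial note, not in the original source): with
+     # subc_quant_wsz == 256 and bf16 activations (t_packing == 2), the forced
+     # compute blocks are bd1c == 512 and bfc == 256, so each contraction block
+     # spans exactly one quantization sub-channel and a single scale can be
+     # applied after the matmul accumulation, as the comment above notes.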
+     if bfc % 128 != 0:
+         raise ValueError(f"Expected {bfc=} to be aligned to 128.")
+     if bd1c % (t_packing * 128) != 0:
+         raise ValueError(
+             f"Expected {bd1c=} to be aligned to {t_packing * 128}.")
+     if bd2c % (t_packing * 128) != 0:
+         raise ValueError(
+             f"Expected {bd2c=} to be aligned to {t_packing * 128}.")
+     if bf % bfc != 0:
+         raise ValueError(f"Expected {bf=} to be aligned to {bfc=}.")
+     if bd1 % bd1c != 0:
+         raise ValueError(f"Expected {bd1=} to be aligned to {bd1c=}.")
+     if bd2 % bd2c != 0:
+         raise ValueError(f"Expected {bd2=} to be aligned to {bd2c=}.")
+     if hidden_size % bd1 != 0 or hidden_size % bd2 != 0:
+         raise ValueError(
+             f"Expected {hidden_size=} to be aligned to {bd1=} and {bd2=}.")
+     if intermediate_size % bf != 0:
+         raise ValueError(
+             f"Expected {intermediate_size=} to be aligned to {bf=}.")
+
+     # Note: we should dump the scales in the kernel-expected shape in the
+     # checkpoint offline, or reshape them right after weight loading.
+     if w1_scale is not None:
+         expected_w1_scale_shape = (
+             num_experts,
+             2,
+             hidden_size // subc_quant_wsz,
+             1,
+             intermediate_size,
+         )
+         if w1_scale.shape != expected_w1_scale_shape:
+             raise ValueError(
+                 f"Expected {w1_scale.shape=} to be {expected_w1_scale_shape}.")
+         if w1_scale.dtype != jnp.float32:
+             w1_scale = w1_scale.astype(jnp.float32)
+
+     if w2_scale is not None:
+         expected_w2_scale_shape = (
+             num_experts,
+             intermediate_size // subc_quant_wsz,
+             1,
+             hidden_size,
+         )
+         if w2_scale.shape != expected_w2_scale_shape:
+             raise ValueError(
+                 f"Expected {w2_scale.shape=} to be {expected_w2_scale_shape}.")
+         if w2_scale.dtype != jnp.float32:
+             w2_scale = w2_scale.astype(jnp.float32)
+
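+     # Editorial sketch (not in the original source): a checkpoint that stores
+     # per-sub-channel scales without the singleton broadcast axis, e.g. with
+     # shape (num_experts, 2, hidden_size // subc_quant_wsz, intermediate_size),
+     # could be adapted right after weight loading along the lines of:
+     #   w1_scale = w1_scale.reshape(num_experts, 2,
+     #                               hidden_size // subc_quant_wsz, 1,
+     #                               intermediate_size)
+     #   w2_scale = w2_scale.reshape(num_experts,
+     #                               intermediate_size // subc_quant_wsz, 1,
+     #                               hidden_size)
+     # The source checkpoint layout here is an assumption for illustration.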
+     if b1 is not None:
+         expected_b1_shape = (num_experts, 2, 1, intermediate_size)
+         if b1.shape != expected_b1_shape:
+             raise ValueError(f"Expected {b1.shape=} to be {expected_b1_shape}.")
+         if b1.dtype != jnp.float32:
+             b1 = b1.astype(jnp.float32)
+
+     if b2 is not None:
+         expected_b2_shape = (num_experts, 1, hidden_size)
+         if b2.shape != expected_b2_shape:
+             raise ValueError(f"Expected {b2.shape=} to be {expected_b2_shape}.")
+         if b2.dtype != jnp.float32:
+             b2 = b2.astype(jnp.float32)
+
+     # Prepare inputs for the kernel.
+     if padded_num_experts != gating_output.shape[-1]:
+         gating_output = jnp.pad(
+             gating_output,
+             ((0, 0), (0, padded_num_experts - gating_output.shape[-1])),
+             constant_values=-jnp.inf,
+         )
+
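+     # Editorial note (not in the original source): padding the extra expert
+     # columns with -inf means the padded experts can never win the top-k
+     # routing inside the kernel.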
+     tokens = tokens.reshape(-1, t_packing, hidden_size // t_packing)
+
+     hbm_block_spec = pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM)
+     renorm_str = "-renorm_k" if renormalize_topk_logits else ""
+     scope_name = f"fused-moe-k_{top_k}{renorm_str}-bt_{bt}_{btc}-bf_{bf}_{bfc}-bd1_{bd1}_{bd1c}-bd2_{bd2}_{bd2c}"
+     fused_moe = pl.pallas_call(
+         functools.partial(
+             _fused_ep_moe_kernel,
+             top_k=top_k,
+             renormalize_topk_logits=renormalize_topk_logits,
+             ep_axis_name=ep_axis_name,
+             act_fn=act_fn,
+             subc_quant_wsz=subc_quant_wsz,
+             bt=bt,
+             bf=bf,
+             bd1=bd1,
+             bd2=bd2,
+             btc=btc,
+             bfc=bfc,
+             bd1c=bd1c,
+             bd2c=bd2c,
+         ),
+         out_shape=jax.ShapeDtypeStruct((local_num_tokens, hidden_size),
+                                        t_dtype),
+         grid_spec=pltpu.PrefetchScalarGridSpec(
+             num_scalar_prefetch=0,
+             in_specs=[
+                 hbm_block_spec,  # tokens_hbm
+                 hbm_block_spec,  # w1_hbm
+                 hbm_block_spec,  # w2_hbm
+                 None if w1_scale is None else hbm_block_spec,  # w1_scale_hbm
+                 None if w2_scale is None else hbm_block_spec,  # w2_scale_hbm
+                 None if b1 is None else hbm_block_spec,  # b1_hbm
+                 None if b2 is None else hbm_block_spec,  # b2_hbm
+                 hbm_block_spec,  # gating_output_hbm
+                 hbm_block_spec,  # a2a_g_hbm
+             ],
+             out_specs=pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+             scratch_shapes=([
+                 # t2e_routing_x2_smem
+                 pltpu.SMEM((2, bt, padded_top_k), jnp.int32),
+                 # d2e_count_x2_smem
+                 pltpu.SMEM((2, num_devices, 1, padded_num_experts), jnp.int32),
+                 # expert_offsets_x2_smem
+                 pltpu.SMEM((2, 2, padded_num_experts), jnp.int32),
+                 # expert_starts_x2_smem
+                 pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
+                 # expert_sizes_x2_smem
+                 pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
+                 # a2a_s_sends_x2_smem
+                 pltpu.SMEM((2, ), jnp.int32),
+                 # a2a_s_x2_vmem
+                 pltpu.VMEM(
+                     (2, bt * num_devices, t_packing, hidden_size // t_packing),
+                     t_dtype),
+                 # a2a_s_acc_x2_vmem
+                 pltpu.VMEM(
+                     (2, bt * num_devices, t_packing, hidden_size // t_packing),
+                     t_dtype),
+                 # a2a_g_acc_vmem
+                 pltpu.VMEM((top_k, bt, t_packing, hidden_size // t_packing),
+                            t_dtype),
+                 # b_gating_x2_vmem
+                 pltpu.VMEM((2, bt, padded_num_experts), t_dtype),
+                 # b_output_x2_vmem
+                 pltpu.VMEM((2, bt, hidden_size), t_dtype),
+                 # b_w1_x2_vmem
+                 pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
+                 # b_w3_x2_vmem
+                 pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
+                 # b_w2_x2_vmem
+                 pltpu.VMEM((2, t_packing, bf, bd2 // t_packing), w2.dtype),
+                 # b_w1_scale_x2_vmem
+                 (None if w1_scale is None else pltpu.VMEM(
+                     (2, t_packing, bd1 // t_packing // subc_quant_wsz, 1, bf),
+                     jnp.float32)),
+                 # b_w3_scale_x2_vmem
+                 (None if w1_scale is None else pltpu.VMEM(
+                     (2, t_packing, bd1 // t_packing // subc_quant_wsz, 1, bf),
+                     jnp.float32)),
+                 # b_w2_scale_x2_vmem
+                 (None if w2_scale is None else pltpu.VMEM(
+                     (2, t_packing, bf // subc_quant_wsz, 1, bd2 // t_packing),
+                     jnp.float32)),
+                 # b_b1_x2_vmem
+                 (None if b1 is None else pltpu.VMEM((2, 1, bf), jnp.float32)),
+                 # b_b3_x2_vmem
+                 (None if b1 is None else pltpu.VMEM((2, 1, bf), jnp.float32)),
+                 # b_b2_x2_vmem
+                 (None if b2 is None else pltpu.VMEM(
+                     (2, t_packing, 1, bd2 // t_packing), jnp.float32)),
+                 # b_acc_vmem
+                 pltpu.VMEM((bt * num_devices, 1, bf * 2), jnp.float32),
+                 # local_sems
+                 pltpu.SemaphoreType.DMA((2, 5)),
+                 # send_sems
+                 pltpu.SemaphoreType.DMA((2, )),
+                 # recv_sems
+                 pltpu.SemaphoreType.DMA((2, )),
+                 # a2a_gather_sem
+                 pltpu.SemaphoreType.DMA,
+                 # a2a_acc_sem
+                 pltpu.SemaphoreType.DMA,
+             ]),
+         ),
+         compiler_params=pltpu.CompilerParams(
+             collective_id=0,
+             vmem_limit_bytes=100 * 1024 * 1024,
+         ),
+         name=scope_name,
+     )
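+     # Editorial note (not in the original source): the `_x2_` scratch buffers
+     # above carry a leading dimension of 2 and appear to be the two halves of
+     # a double buffer (indexed by the alternating semaphore ids in the kernel),
+     # while vmem_limit_bytes sets the kernel's VMEM scratch budget to 100 MiB.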
+
+     @jax.jit
+     @jax.shard_map(
+         mesh=mesh,
+         in_specs=(
+             P(ep_axis_name),  # tokens_hbm
+             P(ep_axis_name),  # w1_hbm
+             P(ep_axis_name),  # w2_hbm
+             None if w1_scale is None else P(ep_axis_name),  # w1_scale_hbm
+             None if w2_scale is None else P(ep_axis_name),  # w2_scale_hbm
+             None if b1 is None else P(ep_axis_name),  # b1_hbm
+             None if b2 is None else P(ep_axis_name),  # b2_hbm
+             P(ep_axis_name),  # gating_output_hbm
+             P(),  # a2a_g_hbm
+         ),
+         out_specs=P(ep_axis_name),
+         check_vma=False,
+     )
+     def kernel(
+         tokens,
+         w1,
+         w2,
+         w1_scale,
+         w2_scale,
+         b1,
+         b2,
+         gating_output,
+         a2a_g_hbm_scratch,
+     ):
+         return fused_moe(
+             pltpu.with_memory_space_constraint(tokens, pltpu.HBM),  # tokens_hbm
+             pltpu.with_memory_space_constraint(w1, pltpu.HBM),  # w1_hbm
+             pltpu.with_memory_space_constraint(w2, pltpu.HBM),  # w2_hbm
+             (None if w1_scale is None else pltpu.with_memory_space_constraint(
+                 w1_scale, pltpu.HBM)),  # w1_scale_hbm
+             (None if w2_scale is None else pltpu.with_memory_space_constraint(
+                 w2_scale, pltpu.HBM)),  # w2_scale_hbm
+             (None if b1 is None else pltpu.with_memory_space_constraint(
+                 b1, pltpu.HBM)),  # b1_hbm
+             (None if b2 is None else pltpu.with_memory_space_constraint(
+                 b2, pltpu.HBM)),  # b2_hbm
+             pltpu.with_memory_space_constraint(gating_output,
+                                                pltpu.HBM),  # gating_output_hbm
+             pltpu.with_memory_space_constraint(a2a_g_hbm_scratch,
+                                                pltpu.HBM),  # a2a_g_hbm
+         )
+
+     a2a_g_hbm_scratch = pl.empty(
+         (num_experts, bt, t_packing, hidden_size // t_packing), t_dtype)
+     return kernel(
+         tokens,
+         w1,
+         w2,
+         w1_scale,
+         w2_scale,
+         b1,
+         b2,
+         gating_output,
+         a2a_g_hbm_scratch,
+     )
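
A minimal usage sketch for fused_ep_moe (editorial illustration, not part of this file: the mesh layout, shapes, dtypes, and every tuning value below are assumptions chosen only to satisfy the alignment checks above, and assume the device count divides num_tokens and num_experts):

    import jax
    import jax.numpy as jnp
    import numpy as np
    from jax.sharding import Mesh
    # fused_ep_moe is the function defined above.

    # Assumed 2D mesh with all devices on the expert-parallel ("model") axis.
    devices = np.array(jax.devices()).reshape(1, -1)
    mesh = Mesh(devices, axis_names=("data", "model"))

    num_tokens, hidden_size, intermediate_size = 1024, 1024, 2048
    num_experts, top_k = 16, 2

    tokens = jnp.zeros((num_tokens, hidden_size), jnp.bfloat16)
    w1 = jnp.zeros((num_experts, 2, hidden_size, intermediate_size), jnp.bfloat16)
    w2 = jnp.zeros((num_experts, intermediate_size, hidden_size), jnp.bfloat16)
    gating_output = jnp.zeros((num_tokens, num_experts), jnp.bfloat16)

    out = fused_ep_moe(
        mesh, tokens, w1, w2, gating_output, top_k,
        bt=128, bf=512, bd1=512, bd2=512,   # placeholder tuning values
        btc=128, bfc=512, bd1c=512, bd2c=512,
    )
    # out: (num_tokens, hidden_size) MoE output, sharded over the EP axis.

The tuning values are placeholders for illustration; in practice they would be swept per model shape and TPU generation.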