tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/kernels/fused_moe_v1_test.py +303 -34
- tests/kernels/mla_v1_test.py +129 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
- tests/lora/test_layers.py +4 -1
- tests/lora/test_lora_perf.py +53 -0
- tests/test_envs.py +110 -12
- tests/test_quantization.py +3 -0
- tests/test_utils.py +1 -2
- tpu_inference/distributed/tpu_connector.py +1 -1
- tpu_inference/envs.py +92 -8
- tpu_inference/executors/ray_distributed_executor.py +5 -1
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
- tpu_inference/kernels/mla/v1/kernel.py +98 -120
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +82 -32
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +146 -85
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
- tpu_inference/layers/common/attention_interface.py +7 -1
- tpu_inference/layers/common/sharding.py +11 -7
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +170 -208
- tpu_inference/layers/vllm/linear_common.py +43 -21
- tpu_inference/layers/vllm/quantization/common.py +11 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
- tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
- tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
- tpu_inference/models/common/model_loader.py +78 -22
- tpu_inference/models/jax/deepseek_v3.py +185 -64
- tpu_inference/models/jax/gpt_oss.py +3 -3
- tpu_inference/models/jax/llama_eagle3.py +4 -5
- tpu_inference/models/jax/qwen2_5_vl.py +161 -47
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
- tpu_inference/models/jax/utils/weight_utils.py +203 -155
- tpu_inference/models/vllm/vllm_model_wrapper.py +11 -5
- tpu_inference/platforms/tpu_platform.py +29 -48
- tpu_inference/runner/compilation_manager.py +112 -46
- tpu_inference/runner/kv_cache.py +40 -20
- tpu_inference/runner/kv_cache_manager.py +40 -31
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/structured_decoding_manager.py +2 -3
- tpu_inference/runner/tpu_runner.py +94 -51
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +71 -22
- tpu_inference/utils.py +41 -14
- tpu_inference/worker/tpu_worker.py +43 -45
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +8 -9
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +59 -58
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0
tpu_inference/kernels/fused_moe/v1/kernel.py

```diff
@@ -7,7 +7,6 @@ import jax.numpy as jnp
 from jax import lax
 from jax._src import dtypes
 from jax.experimental import pallas as pl
-from jax.experimental import shard_map
 from jax.experimental.pallas import tpu as pltpu

 P = jax.sharding.PartitionSpec
@@ -20,7 +19,8 @@ def align_to(x, a):


 def get_dtype_packing(dtype):
-    bits = dtypes.bit_width(dtype)
+    bits = (dtypes.bit_width(dtype)
+            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
     return 32 // bits


```
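A note on `get_dtype_packing`: it computes how many elements of a dtype fit in one 32-bit lane, which drives the `t_packing` token layout used throughout this kernel. A minimal sketch of the same arithmetic using only the public dtype itemsize (the `bit_width`/`itemsize_bits` helpers above live in the private `jax._src.dtypes` namespace, hence the `hasattr` fallback in the diff):

```python
import jax.numpy as jnp

def packing_of(dtype):
    # How many elements fit in one 32-bit lane, mirroring get_dtype_packing().
    bits = jnp.dtype(dtype).itemsize * 8
    return 32 // bits

print(packing_of(jnp.float32))   # 1
print(packing_of(jnp.bfloat16))  # 2 -> bf16 tokens are packed two per lane
print(packing_of(jnp.int8))      # 4
```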
```diff
@@ -35,13 +35,50 @@ def broadcast_minor(src, shape):
                       axis=-1)[..., :shape[-1]]


+def swigluoai(gate: jax.Array,
+              up: jax.Array,
+              *,
+              alpha: float = 1.702,
+              limit: float = 7.0) -> jax.Array:
+    """Activation used in some models such as GPT-OSS."""
+    gate = jnp.clip(gate, a_max=limit)
+    up = jnp.clip(up, a_min=-limit, a_max=limit)
+    glu = gate * jax.nn.sigmoid(alpha * gate)
+    return (up + 1.0) * glu
+
+
+def activation_fn(acc1, acc3, act_fn):
+    if act_fn == "silu":
+        return jax.nn.silu(acc1) * acc3
+    elif act_fn == "gelu":
+        return jax.nn.gelu(acc1) * acc3
+    elif act_fn == "swigluoai":
+        return swigluoai(acc1, acc3)
+    else:
+        raise RuntimeError(f"Unsupported activation function: {act_fn}")
+
+
 def ref_moe(
-
-
-
-
-
-
+    tokens: jax.Array,  # (num_tokens, hidden_size)
+    w1: jax.Array,  # (num_experts, 2, hidden_size, intermediate_size)
+    w2: jax.Array,  # (num_experts, intermediate_size, hidden_size)
+    gating_output: jax.Array,  # (num_tokens, num_experts)
+    top_k: int,
+    *,
+    renormalize_topk_logits: bool = False,
+    act_fn: str = "silu",
+    subc_quant_wsz: int | None = None,
+    w1_scale:
+    (
+        jax.Array | None
+    ) = None,  # F32(num_experts, 2, hidden_size //subc_quant_wsz, 1, intermediate_size)
+    w2_scale:
+    (
+        jax.Array | None
+    ) = None,  # F32(num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size)
+    b1: jax.Array
+    | None = None,  # F32(num_experts, 2, 1, intermediate_size)
+    b2: jax.Array | None = None,  # F32(num_experts, 1, hidden_size)
 ):
     n_tokens = tokens.shape[0]  # num_tokens

```
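The new `swigluoai` activation clips the gate from above and the up-projection on both sides before the sigmoid-gated product. A standalone sketch of the same math, written with `jnp.minimum`/positional `jnp.clip` bounds so it is robust to the `a_min`/`a_max` keyword removal in newer JAX releases:

```python
import jax
import jax.numpy as jnp

def swigluoai_sketch(gate, up, alpha=1.702, limit=7.0):
    # Same math as swigluoai() in the diff, with explicit clip bounds.
    gate = jnp.minimum(gate, limit)     # gate is clipped from above only
    up = jnp.clip(up, -limit, limit)    # up is clipped on both sides
    return (up + 1.0) * (gate * jax.nn.sigmoid(alpha * gate))

x = jnp.linspace(-10.0, 10.0, 5)
print(swigluoai_sketch(x, x))  # saturates toward (limit + 1) * limit = 56
```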
```diff
@@ -53,11 +90,16 @@ def ref_moe(
     top_k_logits, top_k_indices = lax.top_k(
         gating_logits, top_k)  # [num_tokens, top_k], [num_tokens, top_k]

+    if renormalize_topk_logits:
+        top_k_logits = top_k_logits / jnp.sum(
+            top_k_logits, axis=-1, keepdims=True)
+
     t_outputs = []
+    hidden_size, intermediate_size = w1.shape[-2:]

     # Process each token individually
     for i in range(n_tokens):
-        curr_token = jnp.expand_dims(tokens[i], axis=0)  # [1,
+        curr_token = jnp.expand_dims(tokens[i], axis=0)  # [1, hidden_size]
         assigned_expert_ids = top_k_indices[
             i]  # [top_k] - indices of selected experts for token i
         tok_expert_act = []
@@ -65,10 +107,24 @@
         # Process each selected expert for the current token
         for expert_id in assigned_expert_ids:
             # Get expert weights
+            expert_w1 = w1[expert_id, 0].astype(jnp.float32)
+            expert_w3 = w1[expert_id, 1].astype(jnp.float32)
+            if w1_scale is not None:
+                expert_w1 *= jnp.repeat(w1_scale[expert_id, 0, :, 0],
+                                        subc_quant_wsz,
+                                        axis=0)[:hidden_size]
+                expert_w3 *= jnp.repeat(w1_scale[expert_id, 1, :, 0],
+                                        subc_quant_wsz,
+                                        axis=0)[:hidden_size]
             expert_weight_1 = jnp.concat(
-                [
-                axis=-1)  # [
-            expert_weight_2 = w2[expert_id]
+                [expert_w1, expert_w3],
+                axis=-1)  # [hidden_size, 2 * intermediate_size]
+            expert_weight_2 = w2[expert_id].astype(
+                jnp.float32)  # [intermediate_size, hidden_size]
+            if w2_scale is not None:
+                expert_weight_2 *= jnp.repeat(w2_scale[expert_id, :, 0],
+                                              subc_quant_wsz,
+                                              axis=0)[:intermediate_size]

             # First linear layer with SwiGLU activation
             gmm_1_out = curr_token @ expert_weight_1  # [1, 2 * intermediate_size]
@@ -77,37 +133,34 @@
             gmm1_w1_proj, gmm1_w3_proj = jnp.split(
                 gmm_1_out, 2,
                 axis=-1)  # [1, intermediate_size], [1, intermediate_size]
+            if b1 is not None:
+                gmm1_w1_proj += b1[expert_id:expert_id + 1, 0, 0]
+                gmm1_w3_proj += b1[expert_id:expert_id + 1, 1, 0]

             # Apply gated activation: activation(gate) * up
-
-                act = jax.nn.silu(
-                    gmm1_w1_proj) * gmm1_w3_proj  # [1, intermediate_size]
-            elif activation == "gelu":
-                act = jax.nn.gelu(
-                    gmm1_w1_proj) * gmm1_w3_proj  # [1, intermediate_size]
-            else:
-                raise ValueError(
-                    f"Unsupported activation: {activation}. Use 'silu' or 'gelu'."
-                )
+            act = activation_fn(gmm1_w1_proj, gmm1_w3_proj, act_fn)

             # Second linear layer (down projection)
-            gmm_2_out = act @ expert_weight_2  # [1,
+            gmm_2_out = act @ expert_weight_2  # [1, hidden_size]
+            if b2 is not None:
+                gmm_2_out += b2[expert_id:expert_id + 1, 0]
             tok_expert_act.append(gmm_2_out)

         # Combine outputs from all selected experts
         experts_act = jnp.concatenate(tok_expert_act,
-                                      axis=0)  # [top_k,
+                                      axis=0)  # [top_k, hidden_size]

         # Weighted sum using top-k gating weights
         top_k_weights = top_k_logits[i]  # [top_k]
         top_k_weights = jnp.expand_dims(top_k_weights, axis=1)  # [top_k, 1]
         weighted_output = jnp.sum(experts_act * top_k_weights,
                                   axis=0,
-                                  keepdims=True)  # [1,
+                                  keepdims=True)  # [1, hidden_size]

-        t_outputs.append(weighted_output)
+        t_outputs.append(weighted_output.astype(tokens.dtype))

-    return jnp.concatenate(t_outputs,
+    return jnp.concatenate(t_outputs,
+                           axis=0)  # [actual_num_tokens, hidden_size]


 def _fused_ep_moe_kernel(
```
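In the reference path above, dequantization broadcasts one f32 scale per `subc_quant_wsz`-sized group of the contracting dimension via `jnp.repeat`. A toy illustration of that broadcast (group size shrunk for readability; the kernel itself requires `subc_quant_wsz % 256 == 0`):

```python
import jax.numpy as jnp

wsz = 4                                   # toy group size
hidden_size, intermediate_size = 8, 2
w = jnp.ones((hidden_size, intermediate_size))  # stand-in quantized values
scale = jnp.array([[0.5], [2.0]])               # one scale per group
w_dequant = w * jnp.repeat(scale, wsz, axis=0)[:hidden_size]
print(w_dequant[:, 0])  # [0.5 0.5 0.5 0.5 2.  2.  2.  2. ]
```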
```diff
@@ -115,12 +168,19 @@ def _fused_ep_moe_kernel(
     tokens_hbm,  # (local_num_tokens, t_packing, hidden_size // t_packing)
     w1_hbm,  # (local_num_experts, 2, hidden_size, intermediate_size)
     w2_hbm,  # (local_num_experts, intermediate_size, hidden_size)
+    # TODO(jevinjiang): We choose F32 scale for easier slicing. The extra
+    # latency should be hidden in the pipeline overlaping. But is there a better
+    # way to do this?
+    w1_scale_hbm,  # None | F32(local_num_experts, 2, cdiv(hidden_size, subc_quant_wsz), 1, intermediate_size)
+    w2_scale_hbm,  # None | F32(local_num_experts, cdiv(intermediate_size, subc_quant_wsz), 1, hidden_size)
+    b1_hbm,  # None | F32(local_num_experts, 2, 1, intermediate_size)
+    b2_hbm,  # None | F32(local_num_experts, 1, hidden_size)
     gating_hbm,  # (local_num_tokens, padded_num_experts)
     a2a_g_hbm,  # (num_experts, bt, t_packing, hidden_size // t_packing)
     # Output
     output_hbm,  # (local_num_tokens, hidden_size)
     # Scratch
-    t2e_routing_x2_smem,  # <bt_sem_id> (2, bt,
+    t2e_routing_x2_smem,  # <bt_sem_id> (2, bt, padded_top_k)
     d2e_count_x2_smem,  # <bt_sem_id> (2, num_devices, 1, padded_num_experts)
     expert_offsets_x2_smem,  # <bt_sem_id> (2, 2, padded_num_experts): for a2a_s and a2a_g
     expert_starts_x2_smem,  # <bt_sem_id> (2, 1, padded_num_experts)
@@ -136,6 +196,12 @@ def _fused_ep_moe_kernel(
     b_w1_x2_vmem,  # <bw_sem_id> (2, t_packing, bd1 // t_packing, bf)
     b_w3_x2_vmem,  # <bw_sem_id> (2, t_packing, bd1 // t_packing, bf)
     b_w2_x2_vmem,  # <bw_sem_id> (2, t_packing, bf, bd2 // t_packing)
+    b_w1_scale_x2_vmem,  # None | <bw_sem_id> (2, t_packing, bd1 // t_packing // subc_quant_wsz, 1, bf)
+    b_w3_scale_x2_vmem,  # None | <bw_sem_id> (2, t_packing, bd1 // t_packing // subc_quant_wsz, 1, bf)
+    b_w2_scale_x2_vmem,  # None | <bw_sem_id> (2, t_packing, bf // subc_quant_wsz, 1, bd2 // t_packing)
+    b_b1_x2_vmem,  # None | <bw_sem_id> (2, 1, bf)
+    b_b3_x2_vmem,  # None | <bw_sem_id> (2, 1, bf)
+    b_b2_x2_vmem,  # None | <bw_sem_id> (2, t_packing, 1, bd2 // t_packing)
     b_acc_vmem,  # F32(bt * num_devices, 1, bf * 2)
     ### Semaphores:
     local_sems,  # (2, 5): 2 x [b_gating_sem, b_w1_sem, b_w2_sem, b_w3_sem, b_output_sem]
@@ -145,7 +211,10 @@
     a2a_acc_sem,
     *,
     top_k: int,
+    renormalize_topk_logits: bool,
     ep_axis_name: str,
+    act_fn: str,
+    subc_quant_wsz: int | None = None,
     # Kernel tuning params.
     bt: int,  # Block size of local_num_tokens.
     bf: int,  # Block size of intermediate_size.
@@ -160,34 +229,58 @@
     num_devices = lax.axis_size(ep_axis_name)
     local_num_tokens = tokens_hbm.shape[0]
     local_num_experts, intermediate_size, hidden_size = w2_hbm.shape
-    # num_experts = local_num_experts * num_devices
-    # padded_num_experts = expert_starts_x2_smem.shape[-1]
     right_id = (my_id + 1) % num_devices
+    num_experts = a2a_g_hbm.shape[0]
+    padded_num_experts = d2e_count_x2_smem.shape[-1]
+    padded_top_k = t2e_routing_x2_smem.shape[-1]
+    assert padded_num_experts == align_to(num_experts, 128)
+    assert padded_top_k == align_to(top_k, 128)

     t_dtype = tokens_hbm.dtype
     t_packing = get_dtype_packing(t_dtype)
     t_bitwidth = 32 // t_packing
     assert a2a_g_hbm.dtype == t_dtype
-    assert w1_hbm.dtype ==
-    assert w2_hbm.dtype == t_dtype
+    assert w1_hbm.dtype == w2_hbm.dtype

-
-    assert
-
-
-
-
+    assert bd1 % bd1c == 0
+    assert bd2 % bd2c == 0
+    assert bf % bfc == 0
+    assert hidden_size % t_packing == 0
+    assert bd1 % t_packing == 0
+    assert bd2 % t_packing == 0
+    assert bd1c % t_packing == 0
+    assert bd2c % t_packing == 0
+
+    h_per_t_packing = hidden_size // t_packing
+    assert tokens_hbm.shape[-1] == h_per_t_packing
+    bd1_per_t_packing = bd1 // t_packing
+    bd2_per_t_packing = bd2 // t_packing
+    bd1c_per_t_packing = bd1c // t_packing
+    bd2c_per_t_packing = bd2c // t_packing
+
+    if subc_quant_wsz is not None:
+        assert subc_quant_wsz % 256 == 0
+        assert bd1c_per_t_packing == subc_quant_wsz
+        assert bfc == subc_quant_wsz
+        assert bd1 % subc_quant_wsz == 0
+        assert bf % subc_quant_wsz == 0
+        assert bd1_per_t_packing % subc_quant_wsz == 0
+        assert h_per_t_packing % subc_quant_wsz == 0

     num_bt = cdiv(local_num_tokens, bt)
     num_bf = cdiv(intermediate_size, bf)
     num_bd1 = cdiv(hidden_size, bd1)
     num_bd2 = cdiv(hidden_size, bd2)

+    def get_mesh_device_id(ep_rank):
+        dp_rank = jax.lax.axis_index("data")
+        return (dp_rank, ep_rank)
+
     def sync_barrier():
         barrier_sem = pltpu.get_barrier_semaphore()
         pltpu.semaphore_signal(
             barrier_sem,
-            device_id=(
+            device_id=get_mesh_device_id(right_id),
             device_id_type=pltpu.DeviceIdType.MESH,
         )
         pltpu.semaphore_wait(barrier_sem, 1)
```
```diff
@@ -212,30 +305,44 @@
             sem=b_gating_sem,
         ).wait()

-    def get_top_k(input, top_k):
+    def get_top_k(input, top_k, renormalize_topk_logits):
         assert len(input.shape) == 2, input.shape
         input = input.astype(jnp.float32)
+        padded_k_shape = (input.shape[0], padded_top_k)
         top_k_logits_lst = []
         top_k_indices_lst = []
         t2e = jnp.zeros(input.shape, dtype=jnp.int32)
-        t2e_routing = jnp.zeros(
+        t2e_routing = jnp.zeros(padded_k_shape, dtype=jnp.int32)
         iota = jax.lax.broadcasted_iota(jnp.int32, input.shape, 1)
+        padded_k_iota = jax.lax.broadcasted_iota(jnp.int32, padded_k_shape, 1)
+        top_k_logits_sum = jnp.zeros(padded_k_shape, jnp.float32)
+
         for k_id in range(top_k):
-            # TODO(jevinjiang): return both top_k values and indices in
+            # TODO(jevinjiang): return both top_k values and indices in Mosaic
             top_k_logits = jnp.broadcast_to(
-                jnp.max(input, axis=1, keepdims=True),
-
+                jnp.max(input[:, :num_experts], axis=1, keepdims=True),
+                padded_k_shape,
+            ).astype(input.dtype)
             top_k_logits_lst.append(top_k_logits)
+            if renormalize_topk_logits:
+                top_k_logits_sum += top_k_logits
             # TODO(jevinjiang): support bf16 argmax in Mosaic
             top_k_indices = jnp.broadcast_to(
-                jnp.argmax(input, axis=1, keepdims=True),
+                jnp.argmax(input[:, :num_experts], axis=1, keepdims=True),
+                padded_k_shape,
+            )
             top_k_indices_lst.append(top_k_indices)
-            t2e_routing = jnp.where(
-
+            t2e_routing = jnp.where(padded_k_iota == k_id, top_k_indices,
+                                    t2e_routing)
+            mask = iota == broadcast_minor(top_k_indices, input.shape)
             t2e += mask.astype(jnp.int32)
             if k_id != top_k - 1:
                 input = jnp.where(mask, -jnp.inf, input)

+        if renormalize_topk_logits:
+            for k_id in range(top_k):
+                top_k_logits_lst[k_id] /= top_k_logits_sum
+
         expert_sizes = jnp.sum(t2e, axis=0, keepdims=True)
         expert_starts = jnp.zeros_like(expert_sizes)
         return top_k_logits_lst, t2e_routing, expert_sizes, expert_starts
@@ -277,7 +384,7 @@
                 dst_ref=d2e_count_vmem.at[row_id],
                 send_sem=send_sem,
                 recv_sem=recv_sem,
-                device_id=(
+                device_id=get_mesh_device_id(right_id),
                 device_id_type=pltpu.DeviceIdType.MESH,
             ).wait()
             row_id = (row_id + num_devices - 1) % num_devices
@@ -359,10 +466,8 @@
                        pl.ds(start, remote_sz)],
                 send_sem=send_sems.at[e_sem_id],
                 recv_sem=recv_sems.at[e_sem_id],
-                device_id=(
-
-                    recv_id,
-                ),
+                device_id=get_mesh_device_id(recv_id),
+                device_id_type=pltpu.DeviceIdType.MESH,
             ).start()
             a2a_s_sends_x2_smem[e_sem_id] = send_sz
@@ -406,7 +511,8 @@
                 dst_ref=a2a_g_hbm.at[my_e_id, pl.ds(0, remote_sz)],
                 send_sem=send_sems.at[e_sem_id],
                 recv_sem=a2a_gather_sem,
-                device_id=(
+                device_id=get_mesh_device_id(recv_id),
+                device_id_type=pltpu.DeviceIdType.MESH,
             ).start()
             start += sz
```
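`get_top_k` extracts the top-k one element at a time — max/argmax, then masking the winner to `-inf` — because Mosaic does not yet return values and indices together (see the TODOs above). The same loop in plain JAX, without the kernel's padding concerns:

```python
import jax
import jax.numpy as jnp

def topk_by_masking(logits, k):
    # Same idea as get_top_k: take max/argmax, mask the winner, repeat.
    vals, idxs = [], []
    x = logits.astype(jnp.float32)
    for _ in range(k):
        idx = jnp.argmax(x, axis=1, keepdims=True)
        vals.append(jnp.max(x, axis=1, keepdims=True))
        idxs.append(idx)
        mask = jax.lax.broadcasted_iota(jnp.int32, x.shape, 1) == idx
        x = jnp.where(mask, -jnp.inf, x)
    return jnp.concatenate(vals, axis=1), jnp.concatenate(idxs, axis=1)

logits = jnp.array([[0.1, 0.7, 0.2, 0.9]])
print(topk_by_masking(logits, 2))  # values [[0.9, 0.7]], indices [[3, 1]]
```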
```diff
@@ -435,68 +541,173 @@

     def start_fetch_bw1(local_e_id, bw1_sem_id, bf_id, bd1_id):
         for p in range(t_packing):
-            offset = p *
+            offset = p * h_per_t_packing + bd1_id * bd1_per_t_packing
             pltpu.make_async_copy(
                 src_ref=w1_hbm.at[
                     local_e_id,
                     0,
-                    pl.ds(offset,
+                    pl.ds(offset, bd1_per_t_packing),
                     pl.ds(bf_id * bf, bf),
                 ],
                 dst_ref=b_w1_x2_vmem.at[bw1_sem_id, p],
                 sem=local_sems.at[bw1_sem_id, 1],
             ).start()
+            if w1_scale_hbm is not None:
+                assert subc_quant_wsz is not None
+                pltpu.make_async_copy(
+                    src_ref=w1_scale_hbm.at[
+                        local_e_id,
+                        0,
+                        pl.ds(
+                            offset // subc_quant_wsz,
+                            bd1_per_t_packing // subc_quant_wsz,
+                        ),
+                        pl.ds(0, 1),
+                        pl.ds(bf_id * bf, bf),
+                    ],
+                    dst_ref=b_w1_scale_x2_vmem.at[bw1_sem_id, p],
+                    sem=local_sems.at[bw1_sem_id, 1],
+                ).start()
+        if b1_hbm is not None and bd1_id == 0:
+            pltpu.make_async_copy(
+                src_ref=b1_hbm.at[local_e_id, 0,
+                                  pl.ds(0, 1),
+                                  pl.ds(bf_id * bf, bf)],
+                dst_ref=b_b1_x2_vmem.at[bf_id % 2],
+                sem=local_sems.at[bw1_sem_id, 1],
+            ).start()

     def start_fetch_bw2(local_e_id, bw2_sem_id, bf_id, bd2_id):
         for p in range(t_packing):
-            offset = p *
+            offset = p * h_per_t_packing + bd2_id * bd2_per_t_packing
             pltpu.make_async_copy(
                 src_ref=w2_hbm.at[
                     local_e_id,
                     pl.ds(bf_id * bf, bf),
-                    pl.ds(offset,
+                    pl.ds(offset, bd2_per_t_packing),
                 ],
                 dst_ref=b_w2_x2_vmem.at[bw2_sem_id, p],
                 sem=local_sems.at[bw2_sem_id, 2],
             ).start()
+            if w2_scale_hbm is not None:
+                assert subc_quant_wsz is not None
+                pltpu.make_async_copy(
+                    src_ref=w2_scale_hbm.at[
+                        local_e_id,
+                        pl.ds(bf_id * bf // subc_quant_wsz, bf //
+                              subc_quant_wsz),
+                        pl.ds(0, 1),
+                        pl.ds(offset, bd2_per_t_packing),
+                    ],
+                    dst_ref=b_w2_scale_x2_vmem.at[bw2_sem_id, p],
+                    sem=local_sems.at[bw2_sem_id, 2],
+                ).start()
+            if b2_hbm is not None and bf_id == 0:
+                pltpu.make_async_copy(
+                    src_ref=b2_hbm.at[local_e_id,
+                                      pl.ds(0, 1),
+                                      pl.ds(offset, bd2_per_t_packing)],
+                    dst_ref=b_b2_x2_vmem.at[bd2_id % 2, p],
+                    sem=local_sems.at[bw2_sem_id, 2],
+                ).start()

     def start_fetch_bw3(local_e_id, bw3_sem_id, bf_id, bd3_id):
         for p in range(t_packing):
-            offset = p *
+            offset = p * h_per_t_packing + bd3_id * bd1_per_t_packing
             pltpu.make_async_copy(
                 src_ref=w1_hbm.at[
                     local_e_id,
                     1,
-                    pl.ds(offset,
+                    pl.ds(offset, bd1_per_t_packing),
                     pl.ds(bf_id * bf, bf),
                 ],
                 dst_ref=b_w3_x2_vmem.at[bw3_sem_id, p],
                 sem=local_sems.at[bw3_sem_id, 3],
             ).start()
+            if w1_scale_hbm is not None:
+                assert subc_quant_wsz is not None
+                pltpu.make_async_copy(
+                    src_ref=w1_scale_hbm.at[
+                        local_e_id,
+                        1,
+                        pl.ds(
+                            offset // subc_quant_wsz,
+                            bd1_per_t_packing // subc_quant_wsz,
+                        ),
+                        pl.ds(0, 1),
+                        pl.ds(bf_id * bf, bf),
+                    ],
+                    dst_ref=b_w3_scale_x2_vmem.at[bw3_sem_id, p],
+                    sem=local_sems.at[bw3_sem_id, 3],
+                ).start()
+        if b1_hbm is not None and bd3_id == 0:
+            pltpu.make_async_copy(
+                src_ref=b1_hbm.at[local_e_id, 1,
+                                  pl.ds(0, 1),
+                                  pl.ds(bf_id * bf, bf)],
+                dst_ref=b_b3_x2_vmem.at[bf_id % 2],
+                sem=local_sems.at[bw3_sem_id, 3],
+            ).start()

     def wait_fetch_bw1(local_e_id, bw1_sem_id, bf_id, bd1_id):
-        del local_e_id
+        del local_e_id
         pltpu.make_async_copy(
             src_ref=b_w1_x2_vmem.at[bw1_sem_id],
             dst_ref=b_w1_x2_vmem.at[bw1_sem_id],
             sem=local_sems.at[bw1_sem_id, 1],
         ).wait()
+        if w1_scale_hbm is not None:
+            pltpu.make_async_copy(
+                src_ref=b_w1_scale_x2_vmem.at[bw1_sem_id],
+                dst_ref=b_w1_scale_x2_vmem.at[bw1_sem_id],
+                sem=local_sems.at[bw1_sem_id, 1],
+            ).wait()
+        if b1_hbm is not None and bd1_id == 0:
+            pltpu.make_async_copy(
+                src_ref=b_b1_x2_vmem.at[bf_id % 2],
+                dst_ref=b_b1_x2_vmem.at[bf_id % 2],
+                sem=local_sems.at[bw1_sem_id, 1],
+            ).wait()

     def wait_fetch_bw2(local_e_id, bw2_sem_id, bf_id, bd2_id):
-        del local_e_id
+        del local_e_id
         pltpu.make_async_copy(
             src_ref=b_w2_x2_vmem.at[bw2_sem_id],
             dst_ref=b_w2_x2_vmem.at[bw2_sem_id],
             sem=local_sems.at[bw2_sem_id, 2],
         ).wait()
+        if w2_scale_hbm is not None:
+            pltpu.make_async_copy(
+                src_ref=b_w2_scale_x2_vmem.at[bw2_sem_id],
+                dst_ref=b_w2_scale_x2_vmem.at[bw2_sem_id],
+                sem=local_sems.at[bw2_sem_id, 2],
+            ).wait()
+        if b2_hbm is not None and bf_id == 0:
+            pltpu.make_async_copy(
+                src_ref=b_b2_x2_vmem.at[bd2_id % 2],
+                dst_ref=b_b2_x2_vmem.at[bd2_id % 2],
+                sem=local_sems.at[bw2_sem_id, 2],
+            ).wait()

     def wait_fetch_bw3(local_e_id, bw3_sem_id, bf_id, bd3_id):
-        del local_e_id
+        del local_e_id
         pltpu.make_async_copy(
             src_ref=b_w3_x2_vmem.at[bw3_sem_id],
             dst_ref=b_w3_x2_vmem.at[bw3_sem_id],
             sem=local_sems.at[bw3_sem_id, 3],
         ).wait()
+        if w1_scale_hbm is not None:
+            pltpu.make_async_copy(
+                src_ref=b_w3_scale_x2_vmem.at[bw3_sem_id],
+                dst_ref=b_w3_scale_x2_vmem.at[bw3_sem_id],
+                sem=local_sems.at[bw3_sem_id, 3],
+            ).wait()
+        if b1_hbm is not None and bd3_id == 0:
+            pltpu.make_async_copy(
+                src_ref=b_b3_x2_vmem.at[bf_id % 2],
+                dst_ref=b_b3_x2_vmem.at[bf_id % 2],
+                sem=local_sems.at[bw3_sem_id, 3],
+            ).wait()

     def start_fetch_next_bw(local_e_id, bw_sem_id, bf_id, bd1_id, bd2_id):
         next_bd1_id = bd1_id + 1
@@ -520,18 +731,38 @@
     def dynamic_ffn1(
         t_b32_vmem,
         w1_vmem,
+        w1_scale_vmem,
+        b1_vmem,
         w3_vmem,
+        w3_scale_vmem,
+        b3_vmem,
         acc1_vmem,
         acc3_vmem,
         dyn_sz,
         should_init,
     ):
         assert t_b32_vmem.shape == (bt * num_devices, bd1 // t_packing)
-        assert w1_vmem.shape == w3_vmem.shape == (t_packing,
+        assert w1_vmem.shape == w3_vmem.shape == (t_packing, bd1_per_t_packing,
                                                   bf)
         assert acc1_vmem.shape == acc3_vmem.shape == (bt * num_devices, bf)
         assert bd1 % (t_packing * 128) == 0, (bd1, t_packing)
         assert bd1c % (t_packing * 128) == 0, (bd1c, t_packing)
+        if w1_scale_vmem is not None:
+            assert w1_scale_vmem.shape == (
+                t_packing,
+                bd1_per_t_packing // subc_quant_wsz,
+                1,
+                bf,
+            )
+            assert bd1c_per_t_packing == subc_quant_wsz
+        if w3_scale_vmem is not None:
+            assert w3_scale_vmem.shape == (
+                t_packing,
+                bd1_per_t_packing // subc_quant_wsz,
+                1,
+                bf,
+            )
+            assert bd1c_per_t_packing == subc_quant_wsz

         num_loops = cdiv(dyn_sz, btc)
         repack_ty = jnp.dtype(f"int{t_bitwidth}")
@@ -540,7 +771,7 @@
             for bd1c_id in range(cdiv(bd1, bd1c)):
                 t_b32 = t_b32_vmem[
                     pl.ds(btc_id * btc, btc),
-                    pl.ds(bd1c_id *
+                    pl.ds(bd1c_id * bd1c_per_t_packing, bd1c_per_t_packing),
                 ]
                 for p_id in range(t_packing):
                     t = pltpu.bitcast(t_b32.astype(repack_ty), t_dtype)
@@ -548,21 +779,64 @@
                     for bfc_id in range(cdiv(bf, bfc)):
                         w_slices = (
                             p_id,
-                            pl.ds(bd1c_id *
-
+                            pl.ds(bd1c_id * bd1c_per_t_packing,
+                                  bd1c_per_t_packing),
                             pl.ds(bfc_id * bfc, bfc),
                         )
                         w1 = w1_vmem[*w_slices]
                         acc1 = jnp.dot(t,
                                        w1,
                                        preferred_element_type=jnp.float32)
+
+                        if w1_scale_vmem is not None:
+                            w1_scale_slices = (
+                                p_id,
+                                bd1c_id,
+                                pl.ds(0, 1),
+                                pl.ds(bfc_id * bfc, bfc),
+                            )
+                            # TODO(jevinjiang): can use mosaic to load with stride 0.
+                            w1_scale = jnp.broadcast_to(
+                                w1_scale_vmem[*w1_scale_slices], acc1.shape)
+                            acc1 *= w1_scale
+
                         w3 = w3_vmem[*w_slices]
+
                         acc3 = jnp.dot(t,
                                        w3,
                                        preferred_element_type=jnp.float32)
+
+                        if w3_scale_vmem is not None:
+                            w3_scale_slices = (
+                                p_id,
+                                bd1c_id,
+                                pl.ds(0, 1),
+                                pl.ds(bfc_id * bfc, bfc),
+                            )
+                            w3_scale = jnp.broadcast_to(
+                                w3_scale_vmem[*w3_scale_slices], acc3.shape)
+                            acc3 *= w3_scale
+
                         acc_slices = (pl.ds(btc_id * btc,
                                             btc), pl.ds(bfc_id * bfc, bfc))
                         if should_init and p_id == bd1c_id == 0:
+                            if b1_vmem is not None:
+                                b1_scale_slices = (
+                                    pl.ds(0, 1),
+                                    pl.ds(bfc_id * bfc, bfc),
+                                )
+                                b1 = jnp.broadcast_to(
+                                    b1_vmem[*b1_scale_slices], acc1.shape)
+                                acc1 += b1
+                            if b3_vmem is not None:
+                                b3_scale_slices = (
+                                    pl.ds(0, 1),
+                                    pl.ds(bfc_id * bfc, bfc),
+                                )
+                                b3 = jnp.broadcast_to(
+                                    b3_vmem[*b3_scale_slices], acc1.shape)
+                                acc3 += b3
+
                             acc1_vmem[*acc_slices] = acc1
                             acc3_vmem[*acc_slices] = acc3
                         else:
```
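Why the kernel can multiply the scale in *after* `jnp.dot`: the contracting chunk (`bd1c`/`bfc`) is forced to exactly one `subc_quant_wsz` group, so every term in the accumulated sum shares the same scale and the scale factors out of the contraction. A numeric check of that identity (toy group size, assumed values):

```python
import jax.numpy as jnp
import numpy as np

wsz = 4                                   # toy group size (kernel: % 256 == 0)
t = jnp.arange(1.0, 5.0).reshape(1, wsz)  # activations over ONE group
w_q = jnp.arange(8.0).reshape(wsz, 2)     # "quantized" weights for that group
s = jnp.float32(0.03125)                  # the group's single f32 scale

dequant_first = t @ (w_q * s)             # reference order: dequantize, then dot
scale_after = (t @ w_q) * s               # kernel order: dot, then scale
assert np.allclose(dequant_first, scale_after)
```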
```diff
@@ -575,22 +849,28 @@
         acc1_vmem,
         acc3_vmem,
         w2_vmem,
+        w2_scale_vmem,
+        b2_vmem,
         res_b32_vmem,
         dyn_sz,
         should_init,
     ):
-        assert res_b32_vmem.shape == (bt * num_devices,
-        assert w2_vmem.shape == (t_packing, bf,
-            w2_vmem.shape,
-            t_packing,
-            bf,
-            bd2_per_packing,
-        )
+        assert res_b32_vmem.shape == (bt * num_devices, bd2_per_t_packing)
+        assert w2_vmem.shape == (t_packing, bf, bd2_per_t_packing)
         assert acc1_vmem.shape == acc3_vmem.shape == (bt * num_devices, bf)
         assert bd2 % (t_packing * 128) == 0, (bd2, t_packing)
         assert bd2c % (t_packing * 128) == 0, (bd2c, t_packing)
         assert t_dtype in (jnp.float32, jnp.bfloat16)

+        if w2_scale_vmem is not None:
+            assert w2_scale_vmem.shape == (
+                t_packing,
+                bf // subc_quant_wsz,
+                1,
+                bd2_per_t_packing,
+            )
+            assert bfc == subc_quant_wsz
+
         num_loops = cdiv(dyn_sz, btc)
         assert bd2c % (t_packing * 128) == 0, (bd2c, t_packing)

@@ -598,22 +878,47 @@
             for bd2c_id in range(cdiv(bd2, bd2c)):
                 res_lst = []
                 for p_id in range(t_packing):
-                    res = jnp.zeros((btc,
+                    res = jnp.zeros((btc, bd2c_per_t_packing),
+                                    dtype=jnp.float32)
+
+                    if b2_vmem is not None and should_init:
+                        b2_scale_slices = (
+                            p_id,
+                            pl.ds(0, 1),
+                            pl.ds(bd2c_id * bd2c_per_t_packing,
+                                  bd2c_per_t_packing),
+                        )
+                        b2 = jnp.broadcast_to(b2_vmem[*b2_scale_slices],
+                                              res.shape)
+                        res += b2
+
                     for bfc_id in range(cdiv(bf, bfc)):
                         acc_slices = (pl.ds(btc_id * btc,
                                             btc), pl.ds(bfc_id * bfc, bfc))
                         acc1 = acc1_vmem[*acc_slices]
                         acc3 = acc3_vmem[*acc_slices]
-                        act =
+                        act = activation_fn(acc1, acc3, act_fn)
                         w2 = w2_vmem[
                             p_id,
                             pl.ds(bfc_id * bfc, bfc),
                             pl.ds(bd2c_id *
-
+                                  bd2c_per_t_packing, bd2c_per_t_packing),
                         ]
-
-
-
+                        acc = jnp.dot(act,
+                                      w2,
+                                      preferred_element_type=jnp.float32)
+                        if w2_scale_vmem is not None:
+                            w2_scale_slices = (
+                                p_id,
+                                bfc_id,
+                                pl.ds(0, 1),
+                                pl.ds(bd2c_id * bd2c_per_t_packing,
+                                      bd2c_per_t_packing),
+                            )
+                            w2_scale = jnp.broadcast_to(
+                                w2_scale_vmem[*w2_scale_slices], acc.shape)
+                            acc *= w2_scale
+                        res += acc
                     res = pltpu.bitcast(res, jnp.uint32)
                     if t_packing == 2:
                         res = res >> 16 << (16 * p_id)
@@ -626,7 +931,7 @@
                         res |= res_lst[i]
                     sliced_res_vmem = res_b32_vmem.at[
                         pl.ds(btc_id * btc, btc),
-                        pl.ds(bd2c_id *
+                        pl.ds(bd2c_id * bd2c_per_t_packing, bd2c_per_t_packing),
                     ]
                     if should_init:
                         sliced_res_vmem[...] = res
```
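The `res >> 16 << (16 * p_id)` dance above packs two bf16 results into one 32-bit word: the top 16 bits of an f32 are exactly its truncated bf16 encoding. A host-side sketch of the same trick with `lax.bitcast_convert_type` (little-endian layout assumed):

```python
import jax.numpy as jnp
from jax import lax

a = lax.bitcast_convert_type(jnp.float32(1.5), jnp.uint32)    # 0x3FC00000
b = lax.bitcast_convert_type(jnp.float32(-2.25), jnp.uint32)  # 0xC0100000
packed = (a >> 16 << 0) | (b >> 16 << 16)                     # p_id = 0, 1
# uint32 -> bf16 bitcast adds a trailing dim of size 2 (low half first).
halves = lax.bitcast_convert_type(packed, jnp.bfloat16)
print(halves)  # [1.5, -2.25]: top 16 f32 bits == the bf16 truncation
```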
```diff
@@ -655,21 +960,33 @@
         e_id = my_id * local_num_experts + local_e_id
         dyn_sz = expert_sizes_x2_smem[bt_sem_id, 0, e_id]

-
-
+        bd1_per_t_packing = bd1 // t_packing
+        bd2_per_t_packing = bd2 // t_packing

         for bf_id in range(num_bf):
             for bd1_id in range(num_bd1):
                 start_fetch_next_bw(local_e_id, bw_sem_id, bf_id, bd1_id, 0)
+                w1_scale_vmem = (None if b_w1_scale_x2_vmem is None else
+                                 b_w1_scale_x2_vmem.at[bw_sem_id])
+                w3_scale_vmem = (None if b_w3_scale_x2_vmem is None else
+                                 b_w3_scale_x2_vmem.at[bw_sem_id])
+                b1_vmem = None if b_b1_x2_vmem is None else b_b1_x2_vmem.at[
+                    bf_id % 2]
+                b3_vmem = None if b_b3_x2_vmem is None else b_b3_x2_vmem.at[
+                    bf_id % 2]
                 wait_fetch_bw1(local_e_id, bw_sem_id, bf_id, bd1_id)
                 wait_fetch_bw3(local_e_id, bw_sem_id, bf_id, bd1_id)

                 dynamic_ffn1(
                     t_b32_vmem=a2a_s_b32_vmem.at[
                         ...,
-                        pl.ds(bd1_id *
+                        pl.ds(bd1_id * bd1_per_t_packing, bd1_per_t_packing)],
                     w1_vmem=b_w1_x2_vmem.at[bw_sem_id],
+                    w1_scale_vmem=w1_scale_vmem,
+                    b1_vmem=b1_vmem,
                     w3_vmem=b_w3_x2_vmem.at[bw_sem_id],
+                    w3_scale_vmem=w3_scale_vmem,
+                    b3_vmem=b3_vmem,
                     acc1_vmem=b_acc1_vmem,
                     acc3_vmem=b_acc3_vmem,
                     dyn_sz=dyn_sz,
@@ -684,13 +1001,19 @@
                 if bf_id == bd2_id == 0:
                     wait_a2a_gather_send(bt_id, e_sem_id, local_e_id - 2)

+                w2_scale_vmem = (None if b_w2_scale_x2_vmem is None else
+                                 b_w2_scale_x2_vmem.at[bw_sem_id])
+                b2_vmem = None if b_b2_x2_vmem is None else b_b2_x2_vmem.at[
+                    bd2_id % 2]
                 dynamic_ffn2(
                     acc1_vmem=b_acc1_vmem,
                     acc3_vmem=b_acc3_vmem,
                     w2_vmem=b_w2_x2_vmem.at[bw_sem_id],
+                    w2_scale_vmem=w2_scale_vmem,
+                    b2_vmem=b2_vmem,
                     res_b32_vmem=a2a_s_acc_b32_vmem.at[
                         ...,
-                        pl.ds(bd2_id *
+                        pl.ds(bd2_id * bd2_per_t_packing, bd2_per_t_packing)],
                     dyn_sz=dyn_sz,
                     should_init=(bf_id == 0),
                 )
@@ -757,31 +1080,42 @@
         b_gating = b_gating_x2_vmem[bt_sem_id]
         b_gating_score = jax.nn.softmax(b_gating, axis=-1)
         top_k_logits_lst, t2e_routing, expert_sizes, expert_starts = get_top_k(
-            b_gating_score, top_k)
+            b_gating_score, top_k, renormalize_topk_logits)

         all_reduce_metadata(bt_sem_id, t2e_routing, expert_starts,
                             expert_sizes)
+        sync_barrier()

+        # Start a2a scatter for first active expert.
         start_a2a_scatter(bt_id=bt_id, e_sem_id=e_sem_id, local_e_id=0)

         def run_per_expert(local_e_id, e_sem_id):
             sync_barrier()
+
+            # Prefetch weights for CURRENT active expert.
+            # TODO(jevinjiang): It is hard to prefetch weights in previous iteration
+            # because the expert_ffn keeps overwriting the buffers. Triple buffering
+            # could resolve this but it takes more VMEM scratch. Need further
+            # experiment on this.
+            start_fetch_bw1(local_e_id, bw1_sem_id=0, bf_id=0, bd1_id=0)
+            start_fetch_bw3(local_e_id, bw3_sem_id=0, bf_id=0, bd3_id=0)
+
+            # Next ids.
             next_e_sem_id = lax.select(e_sem_id == 0, 1, 0)
             next_local_e_id = local_e_id + 1

+            # Start a2a scatter for NEXT active expert.
             @pl.when(next_local_e_id < local_num_experts)
             def _():
                 start_a2a_scatter(bt_id, next_e_sem_id, next_local_e_id)

-            #
-            start_fetch_bw1(local_e_id, bw1_sem_id=0, bf_id=0, bd1_id=0)
-            start_fetch_bw3(local_e_id, bw3_sem_id=0, bf_id=0, bd3_id=0)
-
-            # Wait for a2a scatter and perform FFN for active expert.
+            # Wait a2a scatter for CURRENT active expert.
             wait_a2a_scatter_recv(bt_id, e_sem_id, local_e_id)
+
+            # Perform FFN for CURRENT active expert.
             expert_ffn(bt_id, e_sem_id, local_e_id)

-            #
+            # Start a2a gather to send back tokens for CURRENT active expert.
             start_a2a_gather(bt_id, e_sem_id, local_e_id)

             # A must-wait before next sync_barrier.
@@ -794,7 +1128,10 @@
             e_sem_id,
             unroll=False)

+        # Wait to receive a2a gather for ALL experts.
         wait_a2a_gather_recv_all()
+
+        # Accumulate results for current batch.
         output = bt_acc(bt_id, top_k_logits_lst)

         # Make sure it is safe to overwrite output buffer.
@@ -827,6 +1164,9 @@
     static_argnames=[
         "mesh",
         "top_k",
+        "renormalize_topk_logits",
+        "act_fn",
+        "subc_quant_wsz",
         "bt",
         "bf",
         "bd1",
```
```diff
@@ -846,6 +1186,17 @@ def fused_ep_moe(
     gating_output: jax.Array,  # (num_tokens, num_experts)
     top_k: int,
     *,
+    renormalize_topk_logits: bool = False,
+    act_fn: str = "silu",
+    subc_quant_wsz: int | None = None,
+    w1_scale: (
+        jax.Array | None
+    ) = None,  # F32(num_experts, 2, hidden_size // subc_quant_wsz, 1, intermediate_size)
+    w2_scale: (
+        jax.Array | None
+    ) = None,  # F32(num_experts, intermediate_size // subc_quant_wsz, 1, hidden_size)
+    b1: jax.Array | None = None,  # F32(num_experts, 2, 1, intermediate_size)
+    b2: jax.Array | None = None,  # F32(num_experts, 1, hidden_size)
     # Kernel tuning parameters.
     bt: int,
     bf: int,
```
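A hypothetical call sketch for the extended `fused_ep_moe` signature. Everything here that is not visible in the hunks — the leading `mesh` argument position, the concrete shapes, and the `("data", "model")` mesh layout — is an assumption (the kernel's `get_mesh_device_id` reads a `"data"` axis and `ep_axis_name` defaults to `"model"`):

```python
import jax
import jax.numpy as jnp
import numpy as np

# Assumed 1 x N mesh: trivial "data" axis, expert parallelism on "model".
mesh = jax.sharding.Mesh(
    np.array(jax.devices()).reshape(1, -1), ("data", "model"))

E, H, I, T, K = 16, 1024, 2048, 256, 2  # assumed toy sizes
tokens = jnp.zeros((T, H), jnp.bfloat16)
w1 = jnp.zeros((E, 2, H, I), jnp.bfloat16)
w2 = jnp.zeros((E, I, H), jnp.bfloat16)
gating = jnp.zeros((T, E), jnp.float32)

out = fused_ep_moe(
    mesh, tokens, w1, w2, gating, K,    # positional order is a guess
    act_fn="swigluoai",
    renormalize_topk_logits=True,
    # bf16 => t_packing == 2, so bd1c/bd2c must be multiples of 256.
    bt=64, bf=512, bd1=512, bd2=512,
    btc=64, bfc=256, bd1c=512, bd2c=512,
)
```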
```diff
@@ -855,52 +1206,164 @@
     bfc: int,
     bd1c: int,
     bd2c: int,
-    ep_axis_name: str =
+    ep_axis_name: str = "model",
 ):
-    #
-
-
-
+    # TODO(jevinjiang): move all these assertions to validation function.
+    if len(mesh.shape) != 2:
+        raise NotImplementedError("Only 2D mesh is supported.")
+
+    for axis_name in mesh.axis_names:
+        if axis_name == ep_axis_name:
+            continue
+        if mesh.shape[axis_name] != 1:
+            raise NotImplementedError(
+                f"Expected all non-ep axis to have size 1 in {mesh.shape=}")

     ep_size = mesh.shape[ep_axis_name]
     num_devices = ep_size

-    num_tokens,
+    num_tokens, hidden_size = tokens.shape
     num_experts, intermediate_size, _ = w2.shape

-
-
+    if w1.shape != (num_experts, 2, hidden_size, intermediate_size):
+        raise ValueError(
+            f"Expected {w1.shape=} to be"
+            f" {(num_experts, 2, hidden_size, intermediate_size)}.")
+
+    if w2.shape != (num_experts, intermediate_size, hidden_size):
+        raise ValueError(f"Expected {w2.shape=} to be"
+                         f" {(num_experts, intermediate_size, hidden_size)}.")
+
+    if gating_output.shape != (num_tokens, num_experts):
+        raise ValueError(
+            f"Expected {gating_output.shape=} to be {(num_tokens, num_experts)}."
+        )
+
+    if not (0 < top_k <= num_experts):
+        raise ValueError(
+            f"Expected {top_k=} to be in range (0, {num_experts=}].")
+
+    if hidden_size % 128 != 0 or intermediate_size % 128 != 0:
+        raise ValueError(
+            f"Expected {hidden_size=} and {intermediate_size=} to be aligned to"
+            " 128. Did you pad them with zeros outside the kernel?")
+    if num_tokens % ep_size != 0:
+        raise ValueError(
+            f"Expected {num_tokens=} to be aligned to {ep_size=}.")
+    if num_experts % ep_size != 0:
+        raise ValueError(
+            f"Expected {num_experts=} to be aligned to {ep_size=}.")

     local_num_tokens = num_tokens // ep_size
     # local_num_experts = num_experts // ep_size
     padded_num_experts = align_to(num_experts, 128)
-
+    padded_top_k = align_to(top_k, 128)
     t_dtype = tokens.dtype
     t_packing = get_dtype_packing(t_dtype)
-    hidden_size = align_to(actual_hidden_size, 128 * t_packing)
-    if hidden_size != actual_hidden_size:
-        tokens = jnp.pad(
-            tokens,
-            ((0, 0), (0, hidden_size - actual_hidden_size)),
-            constant_values=0,
-        )
-    tokens = tokens.reshape(-1, t_packing, hidden_size // t_packing)
-    bt = min(bt, local_num_tokens)
-    bf = min(bf, intermediate_size)
-    bd1 = min(bd1, hidden_size)
-    bd2 = min(bd2, hidden_size)
-
-    btc = min(btc, bt * num_devices)
-    bfc = min(bfc, bf)
-    bd1c = min(bd1c, bd1)
-    bd2c = min(bd2c, bd2)
-    assert bfc % 128 == 0
-    assert bd1c % (t_packing * 128) == 0
-    assert bd2c % (t_packing * 128) == 0
-    assert bf % bfc == 0
-    assert bd1 % bd1c == 0
-    assert bd2 % bd2c == 0

+    # Override bt
+    if local_num_tokens <= t_packing * 8:
+        bt = local_num_tokens
+        btc = bt
+    bt = min(local_num_tokens, bt)
+    # The worst case is that all devices send bt to one device.
+    btc = min(bt, btc, bt * num_devices)
+
+    if local_num_tokens % t_packing != 0:
+        raise ValueError(
+            f"Expected {local_num_tokens=} to be aligned to {t_packing=}.")
+
+    if bt % t_packing != 0:
+        raise ValueError(f"Expected {bt=} to be aligned to {t_packing=}.")
+    if local_num_tokens % bt != 0:
+        raise ValueError(
+            f"Expected {local_num_tokens=} to be aligned to {bt=}.")
+
+    if subc_quant_wsz is not None:
+        if subc_quant_wsz <= 0:
+            raise ValueError(f"Expected {subc_quant_wsz=} to be non-negative.")
+        if subc_quant_wsz % 256 != 0:
+            raise ValueError(
+                "Expected {subc_quant_wsz=} to be aligned to 256.")
+        if hidden_size % subc_quant_wsz != 0:
+            raise ValueError(
+                f"Expected {hidden_size=} to be aligned to {subc_quant_wsz=}.")
+        if intermediate_size % subc_quant_wsz != 0:
+            raise ValueError(
+                f"Expected {intermediate_size=} to be aligned to {subc_quant_wsz=}."
+            )
+        # We force compute size of contracting dim to be subc_quant_wsz. So we can
+        # apply same scale after matmul and accumulation.
+        bd1c = subc_quant_wsz * t_packing
+        bfc = subc_quant_wsz
+
+    if bfc % 128 != 0:
+        raise ValueError(f"Expected {bfc=} to be aligned to 128.")
+    if bd1c % (t_packing * 128) != 0:
+        raise ValueError(
+            f"Expected {bd1c=} to be aligned to {t_packing * 128}.")
+    if bd2c % (t_packing * 128) != 0:
+        raise ValueError(
+            f"Expected {bd2c=} to be aligned to {t_packing * 128}.")
+    if bf % bfc != 0:
+        raise ValueError(f"Expected {bf=} to be aligned to {bfc=}.")
+    if bd1 % bd1c != 0:
+        raise ValueError(f"Expected {bd1=} to be aligned to {bd1c=}.")
+    if bd2 % bd2c != 0:
+        raise ValueError(f"Expected {bd2=} to be aligned to {bd2c=}.")
+    if hidden_size % bd1 != 0 or hidden_size % bd2 != 0:
+        raise ValueError(
+            f"Expected {hidden_size=} to be aligned to {bd1=} and {bd2=}.")
+    if intermediate_size % bf != 0:
+        raise ValueError(
+            f"Expected {intermediate_size=} to be aligned to {bf=}.")
+
+    # Note: we should dump scale as the kernel expected shape in the
+    # checkpoint offline or reshape right after weight loading.
+    if w1_scale is not None:
+        expected_w1_scale_shape = (
+            num_experts,
+            2,
+            hidden_size // subc_quant_wsz,
+            1,
+            intermediate_size,
+        )
+        if w1_scale.shape != expected_w1_scale_shape:
+            raise ValueError(
+                f"Expected {w1_scale.shape=} to be {expected_w1_scale_shape}.")
+        if w1_scale.dtype != jnp.float32:
+            w1_scale = w1_scale.astype(jnp.float32)
+
+    if w2_scale is not None:
+        expected_w2_scale_shape = (
+            num_experts,
+            intermediate_size // subc_quant_wsz,
+            1,
+            hidden_size,
+        )
+        if w2_scale.shape != expected_w2_scale_shape:
+            raise ValueError(
+                f"Expected {w2_scale.shape=} to be {expected_w2_scale_shape}.")
+        if w2_scale.dtype != jnp.float32:
+            w2_scale = w2_scale.astype(jnp.float32)
+
+    if b1 is not None:
+        expected_b1_shape = (num_experts, 2, 1, intermediate_size)
+        if b1.shape != expected_b1_shape:
+            raise ValueError(
+                f"Expected {b1.shape=} to be {expected_b1_shape}.")
+        if b1.dtype != jnp.float32:
+            b1 = b1.astype(jnp.float32)
+
+    if b2 is not None:
+        expected_b2_shape = (num_experts, 1, hidden_size)
+        if b2.shape != expected_b2_shape:
+            raise ValueError(
+                f"Expected {b2.shape=} to be {expected_b2_shape}.")
+        if b2.dtype != jnp.float32:
+            b2 = b2.astype(jnp.float32)
+
+    # Prepare inputs for the kernel.
     if padded_num_experts != gating_output.shape[-1]:
         gating_output = jnp.pad(
             gating_output,
```
```diff
@@ -908,13 +1371,20 @@
             constant_values=-jnp.inf,
         )

-
+    tokens = tokens.reshape(-1, t_packing, hidden_size // t_packing)
+
+    hbm_block_spec = pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM)
+    renorm_str = "-renorm_k" if renormalize_topk_logits else ""
+    scope_name = f"fused-moe-k_{top_k}{renorm_str}-bt_{bt}_{btc}-bf_{bf}_{bfc}-bd1_{bd1}_{bd1c}-bd2_{bd2}_{bd2c}"
     fused_moe = jax.named_scope(scope_name)(
         pl.pallas_call(
             functools.partial(
                 _fused_ep_moe_kernel,
                 top_k=top_k,
+                renormalize_topk_logits=renormalize_topk_logits,
                 ep_axis_name=ep_axis_name,
+                act_fn=act_fn,
+                subc_quant_wsz=subc_quant_wsz,
                 bt=bt,
                 bf=bf,
                 bd1=bd1,
@@ -929,16 +1399,22 @@
             grid_spec=pltpu.PrefetchScalarGridSpec(
                 num_scalar_prefetch=0,
                 in_specs=[
-
-
-
-
-
+                    hbm_block_spec,  # tokens_hbm
+                    hbm_block_spec,  # w1_hbm
+                    hbm_block_spec,  # w2_hbm
+                    None
+                    if w1_scale is None else hbm_block_spec,  # w1_scale_hbm
+                    None
+                    if w2_scale is None else hbm_block_spec,  # w2_scale_hbm
+                    None if b1 is None else hbm_block_spec,  # b1_hbm
+                    None if b2 is None else hbm_block_spec,  # b2_hbm
+                    hbm_block_spec,  # gating_output_hbm
+                    hbm_block_spec,  # a2a_g_hbm
                 ],
                 out_specs=pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
                 scratch_shapes=([
                     # t2e_routing_x2_smem
-                    pltpu.SMEM((2, bt,
+                    pltpu.SMEM((2, bt, padded_top_k), jnp.int32),
                     # d2e_count_x2_smem
                     pltpu.SMEM((2, num_devices, 1, padded_num_experts),
                                jnp.int32),
@@ -984,6 +1460,67 @@
                     pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
                     # b_w2_x2_vmem
                     pltpu.VMEM((2, t_packing, bf, bd2 // t_packing), w2.dtype),
+                    # b_w1_scale_x2_vmem
+                    (None if w1_scale is None else pltpu.VMEM(
+                        (
+                            2,
+                            t_packing,
+                            bd1 // t_packing // subc_quant_wsz,
+                            1,
+                            bf,
+                        ),
+                        jnp.float32,
+                    )),
+                    # b_w3_scale_x2_vmem
+                    (None if w1_scale is None else pltpu.VMEM(
+                        (
+                            2,
+                            t_packing,
+                            bd1 // t_packing // subc_quant_wsz,
+                            1,
+                            bf,
+                        ),
+                        jnp.float32,
+                    )),
+                    # b_w2_scale_x2_vmem
+                    (None if w2_scale is None else pltpu.VMEM(
+                        (
+                            2,
+                            t_packing,
+                            bf // subc_quant_wsz,
+                            1,
+                            bd2 // t_packing,
+                        ),
+                        jnp.float32,
+                    )),
+                    # b_b1_x2_vmem
+                    (None if b1 is None else pltpu.VMEM(
+                        (
+                            2,
+                            1,
+                            bf,
+                        ),
+                        jnp.float32,
+                    )),
+                    # b_b3_x2_vmem
+                    (None if b1 is None else pltpu.VMEM(
+                        (
+                            2,
+                            1,
+                            bf,
+                        ),
+                        jnp.float32,
+                    )),
+                    # b_b2_x2_vmem
+                    (None if b2 is None else pltpu.VMEM(
+                        (
+                            2,
+                            t_packing,
+                            1,
+                            bd2 // t_packing,
+                        ),
+                        jnp.float32,
+                    )),
                     # b_acc_vmem
                     pltpu.VMEM((bt * num_devices, 1, bf * 2), jnp.float32),
                     # local_sems
@@ -1006,30 +1543,62 @@
     ))

     @jax.jit
-    @
-        shard_map.shard_map,
+    @jax.shard_map(
         mesh=mesh,
-        in_specs=(
-
+        in_specs=(
+            P(ep_axis_name),  # tokens_hbm
+            P(ep_axis_name),  # w1_hbm
+            P(ep_axis_name),  # w2_hbm
+            None if w1_scale is None else P(ep_axis_name),  # w1_scale_hbm
+            None if w2_scale is None else P(ep_axis_name),  # w2_scale_hbm
+            None if b1 is None else P(ep_axis_name),  # b1_hbm
+            None if b2 is None else P(ep_axis_name),  # b2_hbm
+            P(ep_axis_name),  # gating_output_hbm
+            P(),  # a2a_g_hbm
+        ),
         out_specs=P(ep_axis_name),
-
+        check_vma=False,
     )
-    def kernel(
+    def kernel(
+        tokens,
+        w1,
+        w2,
+        w1_scale,
+        w2_scale,
+        b1,
+        b2,
+        gating_output,
+        a2a_g_hbm_scratch,
+    ):
         return fused_moe(
-            pltpu.with_memory_space_constraint(tokens,
-
-            pltpu.with_memory_space_constraint(
-            pltpu.with_memory_space_constraint(
-            pltpu.with_memory_space_constraint(
+            pltpu.with_memory_space_constraint(tokens,
+                                               pltpu.HBM),  # tokens_hbm
+            pltpu.with_memory_space_constraint(w1, pltpu.HBM),  # w1_hbm
+            pltpu.with_memory_space_constraint(w2, pltpu.HBM),  # w2_hbm
+            (None if w1_scale is None else pltpu.with_memory_space_constraint(
+                w1_scale, pltpu.HBM)),  # w1_scale_hbm
+            (None if w2_scale is None else pltpu.with_memory_space_constraint(
+                w2_scale, pltpu.HBM)),  # w2_scale_hbm
+            (None if b1 is None else pltpu.with_memory_space_constraint(
+                b1, pltpu.HBM)),  # b1_hbm
+            (None if b2 is None else pltpu.with_memory_space_constraint(
+                b2, pltpu.HBM)),  # b2_hbm
+            pltpu.with_memory_space_constraint(gating_output,
+                                               pltpu.HBM),  # gating_output_hbm
+            pltpu.with_memory_space_constraint(a2a_g_hbm_scratch,
+                                               pltpu.HBM),  # a2a_g_hbm
        )

     a2a_g_hbm_scratch = pl.empty(
         (num_experts, bt, t_packing, hidden_size // t_packing), t_dtype)
-
+    return kernel(
         tokens,
         w1,
         w2,
+        w1_scale,
+        w2_scale,
+        b1,
+        b2,
         gating_output,
         a2a_g_hbm_scratch,
     )
-    return results[:, :actual_hidden_size]
```
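The wrapper also migrates from the removed `jax.experimental.shard_map` import to the `jax.shard_map` decorator API, with `check_vma=False` opting out of shard_map's automatic replication checking for the Pallas body. A minimal, self-contained sketch of that decorator form (assumes a JAX release that exposes `jax.shard_map`; older releases used `jax.experimental.shard_map.shard_map` instead):

```python
import jax
import jax.numpy as jnp
import numpy as np

P = jax.sharding.PartitionSpec
mesh = jax.sharding.Mesh(np.array(jax.devices()), ("model",))

@jax.shard_map(mesh=mesh, in_specs=P("model"), out_specs=P("model"))
def double_shard(x):
    # Runs per device on the local shard: (rows // num_devices, cols).
    return x * 2

x = jnp.arange(float(8 * len(jax.devices()))).reshape(-1, 8)
print(double_shard(x).shape)  # same global shape, computed shard-wise
```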