tpu-inference 0.11.1.dev202511180814__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/kernels/fused_moe_v1_test.py +303 -34
- tests/kernels/mla_v1_test.py +129 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
- tests/lora/test_layers.py +4 -7
- tests/lora/test_lora_perf.py +53 -0
- tests/lora/utils.py +0 -8
- tests/test_envs.py +110 -12
- tests/test_quantization.py +3 -0
- tests/test_utils.py +1 -2
- tpu_inference/__init__.py +22 -3
- tpu_inference/core/disagg_utils.py +6 -8
- tpu_inference/distributed/tpu_connector.py +3 -4
- tpu_inference/distributed/utils.py +3 -2
- tpu_inference/envs.py +93 -9
- tpu_inference/executors/ray_distributed_executor.py +9 -2
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
- tpu_inference/kernels/mla/v1/kernel.py +98 -120
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +140 -67
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +204 -120
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
- tpu_inference/layers/common/attention_interface.py +7 -1
- tpu_inference/layers/common/sharding.py +11 -7
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +170 -208
- tpu_inference/layers/vllm/linear_common.py +43 -21
- tpu_inference/layers/vllm/quantization/common.py +11 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
- tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
- tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +1 -2
- tpu_inference/models/common/model_loader.py +84 -28
- tpu_inference/models/jax/deepseek_v3.py +185 -64
- tpu_inference/models/jax/gpt_oss.py +3 -3
- tpu_inference/models/jax/llama3.py +2 -1
- tpu_inference/models/jax/llama_eagle3.py +8 -5
- tpu_inference/models/jax/llama_guard_4.py +361 -0
- tpu_inference/models/jax/qwen2.py +2 -1
- tpu_inference/models/jax/qwen2_5_vl.py +163 -48
- tpu_inference/models/jax/qwen3.py +2 -1
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
- tpu_inference/models/jax/utils/weight_utils.py +205 -144
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -8
- tpu_inference/platforms/tpu_platform.py +34 -50
- tpu_inference/runner/compilation_manager.py +144 -60
- tpu_inference/runner/kv_cache.py +40 -20
- tpu_inference/runner/kv_cache_manager.py +48 -33
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/structured_decoding_manager.py +2 -3
- tpu_inference/runner/tpu_runner.py +280 -149
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +71 -21
- tpu_inference/tpu_info.py +4 -3
- tpu_inference/utils.py +46 -18
- tpu_inference/worker/tpu_worker.py +197 -63
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +9 -10
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +70 -74
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +0 -28
- tpu_inference/mock/vllm_envs.py +0 -1219
- tpu_inference/mock/vllm_logger.py +0 -212
- tpu_inference/mock/vllm_logging_utils.py +0 -15
- tpu_inference/models/jax/phi3.py +0 -376
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511180814.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py

@@ -1,9 +1,10 @@
-from typing import
+from typing import Union
 
 import jax
 import jax.numpy as jnp
 import torch
 import torch.nn.functional as F
+from compressed_tensors.quantization import QuantizationArgs
 from jax.experimental.layout import Format, Layout
 from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
@@ -12,52 +13,89 @@ from torchax.interop import call_jax, torch_view
 from torchax.ops.mappings import t2j
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.
-
-from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import \
-    CompressedTensorsW8A8Fp8MoEMethod
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
-    WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
+    CompressedTensorsMoEMethod, CompressedTensorsW8A8Fp8MoEMethod)
 
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedFusedMoEMethod
 
 logger = init_logger(__name__)
 
 
+class VllmCompressedTensorsMoEMethod(CompressedTensorsMoEMethod):
+
+    @staticmethod
+    def get_moe_method(
+        quant_config: "VllmCompressedTensorsConfig",  # type: ignore # noqa E501
+        layer: torch.nn.Module,
+        layer_name: str,
+    ) -> CompressedTensorsMoEMethod:
+
+        assert isinstance(layer, FusedMoE)
+
+        # FusedMoE was made by combining multiple Linears so need to
+        # make sure quantization config for Linear can target it
+        quant_config._add_fused_moe_to_target_scheme_map()
+        unfused_names = [
+            layer_name + proj_name
+            for proj_name in [".0.gate_proj", ".0.up_proj", ".0.down_proj"]
+        ]
+        # TODO: refactor this to use expert_mapping and check all layer numbers
+        all_scheme_dicts = [
+            quant_config.get_scheme_dict(layer, name) for name in unfused_names
+        ]
+        scheme_dict = all_scheme_dicts.pop()
+
+        # multiple schemes found
+        if not all([cur_dict == scheme_dict for cur_dict in all_scheme_dicts]):
+            raise ValueError("All MoE projections need to have same "
+                             "quantization scheme but found multiple")
+
+        if scheme_dict is None:
+            return VllmUnquantizedFusedMoEMethod(layer.moe_config,
+                                                 quant_config.mesh)
+
+        weight_quant = scheme_dict.get("weights")
+        input_quant = scheme_dict.get("input_activations")
+
+        if quant_config._is_fp8_w8a8(weight_quant, input_quant):
+            return VllmCompressedTensorsW8A8Fp8MoEMethod(
+                weight_quant, input_quant, layer.moe_config, quant_config.mesh)
+        else:
+            raise RuntimeError(
+                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}")
+
+
 class VllmCompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsW8A8Fp8MoEMethod,
                                             JaxCommonConfig):
 
-    def __init__(self,
-
-
+    def __init__(self, weight_quant: QuantizationArgs,
+                 input_quant: QuantizationArgs, moe: FusedMoEConfig,
+                 mesh: Mesh):
+        super().__init__(weight_quant, input_quant, moe)
         self.mesh = mesh
-        self.quant_config = quant_config
-
-        # disable GPU paths
-        self.use_marlin = False
-        self.rocm_aiter_moe_enabled = False  # is_rocm_aiter_moe_enabled()
-        self.is_fp8_w8a8_sm100 = False
-        self.use_cutlass = False
-        self.disable_expert_map = False
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)
 
-
-
-        w3_weight = layer.w13_weight[:, intermediate_size:]
-        w1_weight_scale = layer.w13_weight_scale[:, :intermediate_size]
-        w3_weight_scale = layer.w13_weight_scale[:, intermediate_size:]
-
+        w13_weight = t2j(layer.w13_weight, use_dlpack=False)
+        w13_weight_scale = t2j(layer.w13_weight_scale, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
-        w2_weight_scale = t2j(layer.w2_weight_scale
-
-
-
-
-
-
-
+        w2_weight_scale = t2j(layer.w2_weight_scale, use_dlpack=False)
+
+        w13_weight_scale = w13_weight_scale.astype(jnp.bfloat16)
+        w2_weight_scale = w2_weight_scale.astype(jnp.bfloat16)
+
+        num_experts, hidden_size, intermediate_size = w2_weight.shape
+        assert w2_weight_scale.shape == (num_experts, hidden_size, 1)
+        assert w13_weight.shape == (num_experts, 2 * intermediate_size,
+                                    hidden_size)
+        assert w13_weight_scale.shape == (num_experts, 2 * intermediate_size,
+                                          1)
+
+        w1_weight, w3_weight = jnp.split(w13_weight, 2, 1)
+        w1_weight_scale, w3_weight_scale = jnp.split(w13_weight_scale, 2, 1)
 
         if layer.use_ep:
             format = Format(Layout((0, 1, 2)),
@@ -69,16 +107,9 @@ class VllmCompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsW8A8Fp8MoEMethod,
             w2_weight = jax.device_put(w2_weight, format)
             w2_weight_scale = jax.device_put(w2_weight_scale, format)
         else:
-            assert intermediate_size == w2_weight.shape[-1]
             n_shards = self.mesh.shape["model"]
             assert intermediate_size % n_shards == 0
 
-            # TODO: enable this if using fused weights
-            # output_sizes = [intermediate_size, intermediate_size]
-            # w13_weight = reorder_concatenated_tensor_for_sharding(
-            #     w13_weight, output_sizes, n_shards, dim=1
-            # )
-
             w13_format = Format(
                 Layout((0, 1, 2)),
                 NamedSharding(self.mesh, P(None, "model", None)))
@@ -119,45 +150,23 @@ class VllmCompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsW8A8Fp8MoEMethod,
         layer: torch.nn.Module,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert isinstance(layer, FusedMoE)
-        if activation != "silu":
+        if layer.activation != "silu":
             raise NotImplementedError(
                 "Only silu is supported for activation function.")
-        if scoring_func != "softmax":
+        if layer.scoring_func != "softmax":
            raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-        # import sys
-        # sys.stdin = open(0)
-        # breakpoint()
-
         # TODO: Use MoE kernel when it supports fp8
-
         seqlen = x.shape[0]
 
         expert_weights = F.softmax(router_logits, dim=-1)
         expert_weights, expert_indices = torch.topk(expert_weights,
-                                                    top_k,
+                                                    layer.top_k,
                                                     dim=-1)
-        if renormalize:
+        if layer.renormalize:
             expert_weights /= expert_weights.sum(dim=-1, keepdim=True)
 
         # cond ffn
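For context on the hunk above: the new `VllmCompressedTensorsMoEMethod.get_moe_method` is a small dispatcher that resolves one scheme dict per unfused projection, requires them to agree, and then selects either the unquantized or the fp8 W8A8 MoE method. The sketch below mirrors only that selection logic under simplifying assumptions: `scheme_dicts` stands in for the results of `quant_config.get_scheme_dict(...)`, the string return values stand in for the real method classes, and the `"fp8"` equality check stands in for `_is_fp8_w8a8`; it is not the package's actual API.

```python
# Minimal sketch of the scheme-selection logic added in this diff.
# `scheme_dicts` is a hypothetical stand-in for the per-projection
# results of quant_config.get_scheme_dict(...).

def select_moe_method(scheme_dicts: list[dict | None]) -> str:
    scheme = scheme_dicts[0]
    # All projections (gate/up/down) must share one quantization scheme.
    if any(d != scheme for d in scheme_dicts[1:]):
        raise ValueError("All MoE projections need the same quantization scheme")
    if scheme is None:
        return "VllmUnquantizedFusedMoEMethod"
    weights = scheme.get("weights")
    activations = scheme.get("input_activations")
    if weights == "fp8" and activations == "fp8":  # stand-in for _is_fp8_w8a8
        return "VllmCompressedTensorsW8A8Fp8MoEMethod"
    raise RuntimeError(f"Unsupported FusedMoe scheme: {weights}, {activations}")


# Example: all three projections quantized fp8/fp8 -> fp8 W8A8 MoE method.
print(select_moe_method([{"weights": "fp8", "input_activations": "fp8"}] * 3))
```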
tpu_inference/layers/vllm/quantization/mxfp4.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Optional, Union
 
 import jax
 import jax.numpy as jnp
@@ -24,9 +24,11 @@ from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
 from vllm.model_executor.layers.quantization.utils.quant_utils import \
     is_layer_skipped
 
+from tpu_inference import envs
+from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (MXFP4,
                                                         get_tpu_quant_method)
-from tpu_inference.layers.vllm.fused_moe import
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func
 from tpu_inference.layers.vllm.linear_common import \
     reorder_concatenated_tensor_for_sharding
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
@@ -85,17 +87,14 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
                 fused_mapping=self.packed_modules_mapping,
             ):
                 return VllmUnquantizedLinearMethod(linear_config)
-            # TODO: Add support for MXFP4 Linear Method.
-            # MXFP4 LinearMethod is available in AMD-Quark, refer to that
-            # implementation if you are interested in enabling MXFP4 here.
             logger.warning_once(
                 "MXFP4 linear layer is not implemented - falling back to "
                 "UnquantizedLinearMethod.")
             return VllmUnquantizedLinearMethod(linear_config)
         elif isinstance(layer, FusedMoE):
-
+            moe_config = self.get_moe_config(layer)
+            return VllmMxfp4MoEMethod(moe_config, self.mesh)
         elif isinstance(layer, Attention):
-            # TODO: Add support for MXFP4 Attention.
             logger.warning_once("MXFP4 attention layer is not implemented. "
                                 "Skipping quantization for this layer.")
             return None
@@ -103,13 +102,30 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
 
 class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
-    def __init__(self,
+    def __init__(self,
+                 moe: FusedMoEConfig,
+                 mesh: Mesh,
+                 ep_axis_name: str = 'model'):
         FusedMoEMethodBase.__init__(self, moe)
 
         # We piggyback on triton implementation as it applies minimal hardware
         # specific post processing to the weights.
         self.mxfp4_backend = Mxfp4Backend.TRITON
+
         self.mesh = mesh
+        self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
+        self.ep_axis_name = ep_axis_name
+        # TODO: Use autotune table once we have it.
+        self.block_size = {
+            "bt": 64,
+            "bf": 1024,
+            "bd1": 1536,
+            "bd2": 1536,
+            "btc": 64,
+            "bfc": 1024,
+            "bd1c": 1536,
+            "bd2c": 1536,
+        }
 
     def get_fused_moe_quant_config(
             self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
@@ -122,6 +138,7 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         assert isinstance(layer, FusedMoE)
+        assert layer.moe_config.has_bias, "mxfp4 quantization alwyas use bias."
 
         w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
         w13_weight_scale = e8m0_to_fp32(
@@ -140,6 +157,8 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
         w2_weight = dequantize_block_weight(w2_weight, w2_weight_scale,
                                             MXFP4_BLOCK_SIZE, jnp.bfloat16)
 
+        num_experts, hidden_size, intermediate_size = w2_weight.shape
+
         # Because we have dequantized weights, scales are not used anymore.
         delattr(layer, "w13_weight_scale")
         delattr(layer, "w2_weight_scale")
@@ -157,110 +176,137 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
         w3_bias = w13_bias[:, 1::2]
         w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-
-
+        if self.use_kernel:
+            # Kernel expects:
+            # w13: (num_experts, 2, hidden_size, intermediate_size)
+            # w2: (num_experts, intermediate_size, hidden_size)
+            # Current format:
+            # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+            # w2_weight: (num_experts, hidden_size, intermediate_size)
+
+            w13_reshaped = w13_weight.reshape(num_experts, 2,
+                                              intermediate_size, hidden_size)
+
+            # Transpose non-constracting dim to right most dim
+            w13_weight_transposed = jnp.swapaxes(w13_reshaped, 2, 3)
+            w2_weight_transposed = jnp.swapaxes(w2_weight, 1, 2)
+
+            # Apply EP sharding
+            ep_sharding = NamedSharding(self.mesh, P("model"))
+
             w13_weight = jax.device_put(
-
-
-
-
-
-
-
-
-
-
-                Format(Layout((0, 1)),
-                       NamedSharding(self.mesh, P("model", None))))
-            w2_bias = jax.device_put(
-                w2_bias,
-                Format(Layout((0, 1)),
-                       NamedSharding(self.mesh, P("model", None))))
+                w13_weight_transposed, Format(Layout((0, 1, 2, 3)),
+                                              ep_sharding))
+            w2_weight = jax.device_put(w2_weight_transposed,
+                                       Format(Layout((0, 1, 2)), ep_sharding))
+
+            w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
+            w13_bias = jax.device_put(w13_bias,
+                                      Format(Layout((0, 1, 2)), ep_sharding))
+            w2_bias = jax.device_put(w2_bias,
+                                     Format(Layout((0, 1)), ep_sharding))
 
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if layer.use_ep:
+                ep_sharding = NamedSharding(self.mesh, P("model"))
+                w13_weight = jax.device_put(
+                    w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
+                w2_weight = jax.device_put(
+                    w2_weight, Format(Layout((0, 1, 2)), ep_sharding))
+
+                w13_bias = jax.device_put(w13_bias,
+                                          Format(Layout((0, 1)), ep_sharding))
+                w2_bias = jax.device_put(w2_bias,
+                                         Format(Layout((0, 1)), ep_sharding))
+
+            else:
+                output_sizes = [intermediate_size, intermediate_size]
+                n_shards = self.mesh.shape["model"]
+                assert intermediate_size % n_shards == 0
+
+                w13_weight = reorder_concatenated_tensor_for_sharding(
+                    w13_weight,
+                    output_sizes,
+                    n_shards,
+                    dim=1,
+                )
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, "model", None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, None, "model"))))
+
+                w13_bias = reorder_concatenated_tensor_for_sharding(
+                    w13_bias,
+                    output_sizes,
+                    n_shards,
+                    dim=1,
+                )
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P(None, "model"))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P(None, None))))
 
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
-        layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
-
         layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
-        layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
 
-
+        layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
+        layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
 
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert isinstance(layer, FusedMoE)
-        if scoring_func != "softmax":
+        if layer.scoring_func != "softmax":
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        x = jax_view(x)
+        w13_weight = jax_view(layer.w13_weight)
+        w2_weight = jax_view(layer.w2_weight)
+        w13_bias = jax_view(layer.w13_bias)
+        w2_bias = jax_view(layer.w2_bias)
+        gating_output = jax_view(router_logits)
+
+        if self.use_kernel:
+            output = fused_ep_moe(
+                mesh=self.mesh,
+                tokens=x,
+                w1=w13_weight,
+                w2=w2_weight,
+                b1=w13_bias,
+                b2=w2_bias,
+                gating_output=gating_output,
+                top_k=layer.top_k,
+                ep_axis_name=self.ep_axis_name,
+                renormalize_topk_logits=layer.renormalize,
+                act_fn=layer.activation,
+                **self.block_size,
+            )
+        else:
+            output = fused_moe_func(
+                hidden_states=x,
+                w1=w13_weight,
+                w2=w2_weight,
+                w1_bias=w13_bias,
+                w2_bias=w2_bias,
+                gating_output=gating_output,
+                topk=layer.top_k,
+                renormalize=layer.renormalize,
+                mesh=self.mesh,
+                use_ep=layer.use_ep,
+                activation=layer.activation,
+            )
 
         return torch_view(output)
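The rewritten `VllmMxfp4MoEMethod.apply` above now branches between the expert-parallel kernel (`fused_ep_moe`) and the existing `fused_moe_func` fallback, with the choice fixed at construction time from `envs.USE_MOE_EP_KERNEL` and `moe.use_ep`. The sketch below illustrates only that dispatch shape under stated assumptions: the two stub functions are hypothetical stand-ins for the real kernels, and `apply_moe` is not an actual tpu-inference API.

```python
# Minimal sketch of the kernel-vs-fallback dispatch added to
# VllmMxfp4MoEMethod.apply. The stubs stand in for the real
# fused_ep_moe / fused_moe_func calls shown in the diff.
from typing import Any, Callable


def fused_ep_moe_stub(**kwargs: Any) -> str:
    return "fused_ep_moe"  # expert-parallel kernel path


def fused_moe_func_stub(**kwargs: Any) -> str:
    return "fused_moe_func"  # generic fallback path


def apply_moe(use_moe_ep_kernel: bool,
              use_ep: bool,
              ep_kernel: Callable[..., str] = fused_ep_moe_stub,
              fallback: Callable[..., str] = fused_moe_func_stub,
              **moe_args: Any) -> str:
    # Same condition as `self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep`.
    use_kernel = use_moe_ep_kernel and use_ep
    return ep_kernel(**moe_args) if use_kernel else fallback(**moe_args)


# The EP kernel is only selected when both the env flag and EP are enabled.
assert apply_moe(use_moe_ep_kernel=True, use_ep=True) == "fused_ep_moe"
assert apply_moe(use_moe_ep_kernel=True, use_ep=False) == "fused_moe_func"
```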