tpu-inference 0.11.1.dev202511220812-py3-none-any.whl → 0.13.2.dev20251230-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +317 -34
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +406 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +320 -0
- tests/layers/vllm/test_unquantized.py +662 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +26 -6
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +110 -12
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +2 -45
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +15 -10
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +92 -8
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +25 -4
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +25 -12
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/fused_moe_gmm.py +506 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +282 -0
- tpu_inference/layers/common/sharding.py +32 -9
- tpu_inference/layers/common/utils.py +94 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +101 -494
- tpu_inference/layers/vllm/linear.py +64 -0
- tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
- tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
- tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
- tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
- tpu_inference/layers/vllm/quantization/__init__.py +19 -3
- tpu_inference/layers/vllm/quantization/awq.py +96 -82
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
- tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
- tpu_inference/layers/vllm/quantization/fp8.py +119 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
- tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +112 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +18 -5
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +179 -51
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
- tpu_inference/models/jax/utils/weight_utils.py +234 -155
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +51 -72
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +180 -80
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +55 -33
- tpu_inference/runner/lora_utils.py +16 -1
- tpu_inference/runner/multimodal_manager.py +16 -2
- tpu_inference/runner/persistent_batch_manager.py +54 -2
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +16 -3
- tpu_inference/runner/tpu_runner.py +124 -61
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +84 -22
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +72 -44
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +66 -52
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
- tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
- tpu_inference/layers/vllm/linear_common.py +0 -186
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/mxfp4.py

@@ -1,16 +1,29 @@
-
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
 
 import jax
 import jax.numpy as jnp
 import torch
-from jax.
-from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from jax.sharding import Mesh, PartitionSpec
 from torch.nn.parameter import Parameter
-from torchax.interop import
+from torchax.interop import torch_view
 from torchax.ops.mappings import t2j
-from vllm.
+from vllm.attention.layer import Attention
 from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig, FusedMoEQuantConfig,
+    FusedMoEConfig, FusedMoEQuantConfig, mxfp4_w4a16_moe_quant_config)
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                          FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import LinearBase
@@ -26,48 +39,30 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import \
 
 from tpu_inference.layers.common.quant_methods import (MXFP4,
                                                         get_tpu_quant_method)
-from tpu_inference.layers.
-
-
-from tpu_inference.layers.vllm.
+from tpu_inference.layers.common.quantization import \
+    dequantize_tensor_from_mxfp4_packed
+from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.vllm.fused_moe import (FusedMoEBackend,
+                                                 fused_moe_apply,
+                                                 select_moe_backend)
+from tpu_inference.layers.vllm.process_weights.fused_moe_weights import (
+    FusedMoEWeights, process_moe_weights, quantize_moe_weights,
+    shard_moe_weights)
+from tpu_inference.layers.vllm.quantization.configs import VllmQuantConfig
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedLinearMethod
+from tpu_inference.logger import init_logger
+from tpu_inference.utils import get_mesh_shape_product
 
-
+REQUANTIZED_BLOCK_SIZE = 512
 
 P = PartitionSpec
-logger = init_logger(__name__)
-
-
-# TODO(kyuyeunk): Move these functions into a common utility file.
-def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:
-    assert u8_packed_e2m1.dtype == jnp.uint8
-    e2m1 = jax.lax.bitcast_convert_type(u8_packed_e2m1, jnp.float4_e2m1fn)
-    # bitcast creates one more dimension that splits 8 bits into two e2m1.
-    # we flatten them with the last dim.
-    return jnp.reshape(e2m1, e2m1.shape[:-2] + (-1, ))
-
-
-def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
-    e8_finfo = jnp.finfo(jnp.float8_e8m0fnu)
-    exponents = u8.astype(jnp.int32) + e8_finfo.minexp
-    ones = jnp.ones_like(u8, dtype=jnp.float32)
-    return jnp.ldexp(ones, exponents)
 
-
-def dequantize_block_weight(weight: jax.Array,
-                            scale: jax.Array,
-                            block_size: int,
-                            out_dtype: jnp.dtype = jnp.bfloat16) -> jax.Array:
-    orig_shape = weight.shape
-    weight_block = weight.reshape(orig_shape[:-1] + (-1, block_size))
-    weight_dequantized = weight_block.astype(jnp.float32) * jnp.expand_dims(
-        scale, -1)
-    return weight_dequantized.reshape(orig_shape).astype(out_dtype)
+logger = init_logger(__name__)
 
 
 @register_quantization_config(get_tpu_quant_method(MXFP4))
-class VllmMxfp4Config(Mxfp4Config,
+class VllmMxfp4Config(Mxfp4Config, VllmQuantConfig):
 
     @classmethod
     def get_name(cls):
@@ -75,7 +70,6 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
 
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
-        from vllm.attention.layer import Attention  # Avoid circular import
 
         if isinstance(layer, LinearBase):
             linear_config = self.get_linear_config(layer)
@@ -85,17 +79,14 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
                 fused_mapping=self.packed_modules_mapping,
             ):
                 return VllmUnquantizedLinearMethod(linear_config)
-            # TODO: Add support for MXFP4 Linear Method.
-            # MXFP4 LinearMethod is available in AMD-Quark, refer to that
-            # implementation if you are interested in enabling MXFP4 here.
             logger.warning_once(
                 "MXFP4 linear layer is not implemented - falling back to "
                 "UnquantizedLinearMethod.")
             return VllmUnquantizedLinearMethod(linear_config)
         elif isinstance(layer, FusedMoE):
-
+            moe_config = self.get_moe_config(layer)
+            return VllmMxfp4MoEMethod(moe_config, self.mesh)
         elif isinstance(layer, Attention):
-            # TODO: Add support for MXFP4 Attention.
             logger.warning_once("MXFP4 attention layer is not implemented. "
                                 "Skipping quantization for this layer.")
             return None
@@ -103,164 +94,132 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
 
 class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
-    def __init__(
+    def __init__(
+        self,
+        moe: FusedMoEConfig,
+        mesh: Mesh,
+        ep_axis_name: str = "model",
+    ):
         FusedMoEMethodBase.__init__(self, moe)
 
         # We piggyback on triton implementation as it applies minimal hardware
         # specific post processing to the weights.
         self.mxfp4_backend = Mxfp4Backend.TRITON
+
         self.mesh = mesh
+        self.moe_backend = select_moe_backend(self.moe)
+
+        self.extra_backend_kwargs = {}
+        if self.moe_backend == FusedMoEBackend.FUSED_MOE:
+            # When fused moe kernle is used, we pass extra arguments like
+            # tuned block sizes to the kernel.
+            self.extra_backend_kwargs = dict(
+                subc_quant_wsz=REQUANTIZED_BLOCK_SIZE,
+                ep_axis_name=ep_axis_name,
+                # TODO: Use autotune table once we have it.
+                bt=256,
+                bf=1024,
+                bd1=1024,
+                bd2=1024,
+                btc=256,
+                bfc=1024,
+                bd1c=1024,
+                bd2c=1024,
+            )
 
     def get_fused_moe_quant_config(
             self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
-
-
-
-            layer.w13_bias,
-            layer.w2_bias,
+        return mxfp4_w4a16_moe_quant_config(
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            w1_bias=layer.w13_bias,
+            w2_bias=layer.w2_bias,
         )
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         assert isinstance(layer, FusedMoE)
+        assert layer.moe_config.has_bias, "mxfp4 quantization alwyas use bias."
 
-        w13_weight =
-        w13_weight_scale =
-            t2j(layer.w13_weight_scale, use_dlpack=False))
+        w13_weight = t2j(layer.w13_weight, use_dlpack=False)
+        w13_weight_scale = t2j(layer.w13_weight_scale, use_dlpack=False)
         w13_bias = t2j(layer.w13_bias, use_dlpack=False)
 
-        w2_weight =
-        w2_weight_scale =
-            t2j(layer.w2_weight_scale, use_dlpack=False))
+        w2_weight = t2j(layer.w2_weight, use_dlpack=False)
+        w2_weight_scale = t2j(layer.w2_weight_scale, use_dlpack=False)
         w2_bias = t2j(layer.w2_bias, use_dlpack=False)
 
[48 deleted lines (old 136-183) were not rendered in the original diff view]
-        n_shards = self.mesh.shape["model"]
-        assert intermediate_size % n_shards == 0
-        w13_weight = reorder_concatenated_tensor_for_sharding(w13_weight,
-                                                              output_sizes,
-                                                              n_shards,
-                                                              dim=1)
-        w13_weight = jax.device_put(
-            w13_weight,
-            Format(Layout((0, 1, 2)),
-                   NamedSharding(self.mesh, P(None, "model", None))))
-        w2_weight = jax.device_put(
-            w2_weight,
-            Format(Layout((0, 1, 2)),
-                   NamedSharding(self.mesh, P(None, None, "model"))))
-
-        w13_bias = reorder_concatenated_tensor_for_sharding(w13_bias,
-                                                            output_sizes,
-                                                            n_shards,
-                                                            dim=1)
-        w13_bias = jax.device_put(
-            w13_bias,
-            Format(Layout((0, 1)),
-                   NamedSharding(self.mesh, P(None, "model"))))
-        w2_bias = jax.device_put(
-            w2_bias,
-            Format(Layout((0, 1)), NamedSharding(self.mesh, P(None,
-                                                              None))))
+        @jax.jit
+        def process_mxfp4_moe_weights(
+            w13_weight: jax.Array,
+            w13_weight_scale: jax.Array,
+            w13_bias: jax.Array,
+            w2_weight: jax.Array,
+            w2_weight_scale: jax.Array,
+            w2_bias: jax.Array,
+        ) -> FusedMoEWeights:
+            # Dequantize fp4 weights into fp32.
+            w13_weight = dequantize_tensor_from_mxfp4_packed(
+                w13_weight, w13_weight_scale, 2)
+            w2_weight = dequantize_tensor_from_mxfp4_packed(
+                w2_weight, w2_weight_scale, 2)
+
+            w13_interleave = layer.activation == "swigluoai"
+            w13_reorder_size = get_mesh_shape_product(
+                self.mesh, ShardingAxisName.MLP_TENSOR)
+
+            weights = quantize_moe_weights(
+                FusedMoEWeights(
+                    w13_weight=w13_weight,
+                    w13_weight_scale=None,
+                    w13_bias=w13_bias,
+                    w2_weight=w2_weight,
+                    w2_weight_scale=None,
+                    w2_bias=w2_bias,
+                ),
+                jnp.float4_e2m1fn,
+                REQUANTIZED_BLOCK_SIZE,
+            )
+            return process_moe_weights(
+                weights,
+                moe_backend=self.moe_backend,
+                w13_reorder_size=w13_reorder_size,
+                w13_interleave=w13_interleave,
+            )
+
+        weights = process_mxfp4_moe_weights(
+            w13_weight,
+            w13_weight_scale,
+            w13_bias,
+            w2_weight,
+            w2_weight_scale,
+            w2_bias,
+        )
+        weights = torch_view(
+            shard_moe_weights(weights, self.moe_backend, self.mesh))
 
-        layer.w13_weight = Parameter(
-
-        layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
+        layer.w13_weight = Parameter(weights.w13_weight, requires_grad=False)
+        layer.w2_weight = Parameter(weights.w2_weight, requires_grad=False)
 
-        layer.
-
+        layer.w13_weight_scale = Parameter(weights.w13_weight_scale,
+                                           requires_grad=False)
+        layer.w2_weight_scale = Parameter(weights.w2_weight_scale,
+                                          requires_grad=False)
 
-
+        layer.w13_bias = Parameter(weights.w13_bias, requires_grad=False)
+        layer.w2_bias = Parameter(weights.w2_bias, requires_grad=False)
 
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         router_logits: torch.Tensor,
[9 deleted lines (old 226-234) were not rendered in the original diff view]
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        assert isinstance(layer, FusedMoE)
-        if scoring_func != "softmax":
-            raise NotImplementedError(
-                "Only softmax is supported for scoring_func")
-
-        # Use the original implementation
-        output = fused_moe_func_padded(
-            jax_view(x),
-            jax_view(layer.w13_weight),
-            jax_view(layer.w2_weight),
-            jax_view(layer.w13_bias) if self.moe.has_bias else None,
-            jax_view(layer.w2_bias) if self.moe.has_bias else None,
-            jax_view(router_logits),
-            topk=top_k,
-            global_num_experts=global_num_experts,
-            renormalize=renormalize,
-            reduce_results=layer.reduce_results,
-            mesh=self.mesh,
-            use_ep=layer.use_ep,
-            activation=activation,
+    ) -> torch.Tensor:
+
+        return fused_moe_apply(
+            layer,
+            x,
+            router_logits,
+            self.moe_backend,
+            self.mesh,
+            self.extra_backend_kwargs,
         )
-
-        return torch_view(output)