tpu-inference 0.11.1.dev202511150811__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of tpu-inference might be problematic.

Files changed (179)
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +53 -0
  6. tests/core/test_dp_scheduler.py +899 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/fused_moe_v1_test.py +105 -0
  10. tests/kernels/mla_v1_test.py +396 -0
  11. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  12. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  13. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  14. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +549 -0
  15. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  16. tests/lora/__init__.py +0 -0
  17. tests/lora/conftest.py +32 -0
  18. tests/lora/test_bgmv.py +43 -0
  19. tests/lora/test_layers.py +654 -0
  20. tests/lora/test_lora.py +133 -0
  21. tests/lora/utils.py +96 -0
  22. tests/test_base.py +201 -0
  23. tests/test_envs.py +182 -0
  24. tests/test_quantization.py +836 -0
  25. tests/test_tpu_info.py +120 -0
  26. tests/test_utils.py +236 -0
  27. tpu_inference/__init__.py +34 -0
  28. tpu_inference/core/__init__.py +0 -0
  29. tpu_inference/core/core_tpu.py +786 -0
  30. tpu_inference/core/disagg_executor.py +118 -0
  31. tpu_inference/core/disagg_utils.py +51 -0
  32. tpu_inference/core/sched/__init__.py +0 -0
  33. tpu_inference/core/sched/dp_scheduler.py +523 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/jax_parallel_state.py +67 -0
  36. tpu_inference/distributed/tpu_connector.py +728 -0
  37. tpu_inference/distributed/utils.py +59 -0
  38. tpu_inference/env_override.py +9 -0
  39. tpu_inference/envs.py +107 -0
  40. tpu_inference/executors/__init__.py +0 -0
  41. tpu_inference/executors/ray_distributed_executor.py +362 -0
  42. tpu_inference/experimental/__init__.py +0 -0
  43. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  44. tpu_inference/kernels/__init__.py +0 -0
  45. tpu_inference/kernels/collectives/__init__.py +0 -0
  46. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  47. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  48. tpu_inference/kernels/collectives/util.py +47 -0
  49. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  50. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  51. tpu_inference/kernels/fused_moe/__init__.py +0 -0
  52. tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
  53. tpu_inference/kernels/fused_moe/v1/kernel.py +1035 -0
  54. tpu_inference/kernels/mla/__init__.py +0 -0
  55. tpu_inference/kernels/mla/v1/__init__.py +0 -0
  56. tpu_inference/kernels/mla/v1/kernel.py +1349 -0
  57. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  58. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  59. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  60. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  61. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  62. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  66. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1478 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1482 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4147 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +367 -0
  71. tpu_inference/kernels/ragged_paged_attention/v3/util.py +51 -0
  72. tpu_inference/layers/__init__.py +0 -0
  73. tpu_inference/layers/common/__init__.py +0 -0
  74. tpu_inference/layers/common/attention_interface.py +390 -0
  75. tpu_inference/layers/common/attention_metadata.py +34 -0
  76. tpu_inference/layers/common/binary_search.py +295 -0
  77. tpu_inference/layers/common/quant_methods.py +8 -0
  78. tpu_inference/layers/common/sharding.py +582 -0
  79. tpu_inference/layers/jax/__init__.py +0 -0
  80. tpu_inference/layers/jax/attention/__init__.py +0 -0
  81. tpu_inference/layers/jax/attention/attention.py +255 -0
  82. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  83. tpu_inference/layers/jax/attention/gpt_oss_attention.py +262 -0
  84. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  85. tpu_inference/layers/jax/base.py +151 -0
  86. tpu_inference/layers/jax/constants.py +88 -0
  87. tpu_inference/layers/jax/layers.py +301 -0
  88. tpu_inference/layers/jax/misc.py +16 -0
  89. tpu_inference/layers/jax/moe/__init__.py +0 -0
  90. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  91. tpu_inference/layers/jax/moe/gpt_oss_moe.py +185 -0
  92. tpu_inference/layers/jax/moe/moe.py +209 -0
  93. tpu_inference/layers/jax/rope.py +280 -0
  94. tpu_inference/layers/jax/rope_interface.py +214 -0
  95. tpu_inference/layers/jax/sample/__init__.py +0 -0
  96. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  97. tpu_inference/layers/jax/sample/sampling.py +96 -0
  98. tpu_inference/layers/jax/sample/sampling_metadata.py +76 -0
  99. tpu_inference/layers/jax/transformer_block.py +107 -0
  100. tpu_inference/layers/vllm/__init__.py +0 -0
  101. tpu_inference/layers/vllm/attention.py +221 -0
  102. tpu_inference/layers/vllm/fused_moe.py +507 -0
  103. tpu_inference/layers/vllm/linear_common.py +186 -0
  104. tpu_inference/layers/vllm/quantization/__init__.py +39 -0
  105. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  106. tpu_inference/layers/vllm/quantization/common.py +105 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  108. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +120 -0
  109. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +203 -0
  110. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  111. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  112. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  113. tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
  114. tpu_inference/layers/vllm/quantization/unquantized.py +386 -0
  115. tpu_inference/layers/vllm/sharding.py +230 -0
  116. tpu_inference/logger.py +10 -0
  117. tpu_inference/lora/__init__.py +0 -0
  118. tpu_inference/lora/torch_lora_ops.py +103 -0
  119. tpu_inference/lora/torch_punica_tpu.py +311 -0
  120. tpu_inference/mock/__init__.py +0 -0
  121. tpu_inference/mock/vllm_config_utils.py +28 -0
  122. tpu_inference/mock/vllm_envs.py +1219 -0
  123. tpu_inference/mock/vllm_logger.py +212 -0
  124. tpu_inference/mock/vllm_logging_utils.py +15 -0
  125. tpu_inference/models/__init__.py +0 -0
  126. tpu_inference/models/common/__init__.py +0 -0
  127. tpu_inference/models/common/model_loader.py +444 -0
  128. tpu_inference/models/jax/__init__.py +0 -0
  129. tpu_inference/models/jax/deepseek_v3.py +868 -0
  130. tpu_inference/models/jax/gpt_oss.py +492 -0
  131. tpu_inference/models/jax/jax_intermediate_tensor.py +79 -0
  132. tpu_inference/models/jax/llama3.py +375 -0
  133. tpu_inference/models/jax/llama4.py +629 -0
  134. tpu_inference/models/jax/llama_eagle3.py +333 -0
  135. tpu_inference/models/jax/phi3.py +376 -0
  136. tpu_inference/models/jax/qwen2.py +375 -0
  137. tpu_inference/models/jax/qwen2_5_vl.py +1103 -0
  138. tpu_inference/models/jax/qwen3.py +302 -0
  139. tpu_inference/models/jax/utils/__init__.py +0 -0
  140. tpu_inference/models/jax/utils/file_utils.py +96 -0
  141. tpu_inference/models/jax/utils/multi_modal_utils.py +163 -0
  142. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  143. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +5 -0
  144. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +6 -0
  145. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +5 -0
  146. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +6 -0
  147. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +105 -0
  148. tpu_inference/models/jax/utils/quantization/quantization_utils.py +653 -0
  149. tpu_inference/models/jax/utils/weight_utils.py +529 -0
  150. tpu_inference/models/vllm/__init__.py +0 -0
  151. tpu_inference/models/vllm/vllm_model_wrapper.py +286 -0
  152. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  153. tpu_inference/platforms/__init__.py +2 -0
  154. tpu_inference/platforms/tpu_platform.py +269 -0
  155. tpu_inference/runner/__init__.py +0 -0
  156. tpu_inference/runner/block_table.py +122 -0
  157. tpu_inference/runner/compilation_manager.py +780 -0
  158. tpu_inference/runner/input_batch.py +435 -0
  159. tpu_inference/runner/kv_cache.py +132 -0
  160. tpu_inference/runner/kv_cache_manager.py +479 -0
  161. tpu_inference/runner/lora_utils.py +92 -0
  162. tpu_inference/runner/multimodal_manager.py +217 -0
  163. tpu_inference/runner/persistent_batch_manager.py +244 -0
  164. tpu_inference/runner/speculative_decoding_manager.py +248 -0
  165. tpu_inference/runner/structured_decoding_manager.py +88 -0
  166. tpu_inference/runner/tpu_runner.py +1620 -0
  167. tpu_inference/runner/utils.py +426 -0
  168. tpu_inference/spec_decode/__init__.py +0 -0
  169. tpu_inference/spec_decode/jax/__init__.py +0 -0
  170. tpu_inference/spec_decode/jax/eagle3.py +367 -0
  171. tpu_inference/tpu_info.py +77 -0
  172. tpu_inference/utils.py +317 -0
  173. tpu_inference/worker/__init__.py +0 -0
  174. tpu_inference/worker/tpu_worker.py +321 -0
  175. tpu_inference-0.11.1.dev202511150811.dist-info/METADATA +107 -0
  176. tpu_inference-0.11.1.dev202511150811.dist-info/RECORD +179 -0
  177. tpu_inference-0.11.1.dev202511150811.dist-info/WHEEL +5 -0
  178. tpu_inference-0.11.1.dev202511150811.dist-info/licenses/LICENSE +201 -0
  179. tpu_inference-0.11.1.dev202511150811.dist-info/top_level.txt +2 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -0,0 +1,208 @@
+from typing import Optional
+
+import jax
+import jax.numpy as jnp
+import torch
+from compressed_tensors.quantization import (QuantizationArgs,
+                                              QuantizationStrategy)
+from jax.sharding import NamedSharding, PartitionSpec
+from torchax.interop import jax_view, torch_view
+from torchax.ops.mappings import t2j
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import \
+    CompressedTensorsW8A8Fp8
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import \
+    per_tensor_dequantize
+
+from tpu_inference.layers.vllm.linear_common import (
+    sharded_quantized_matmul, slice_sharded_tensor_for_concatenation,
+    torch_to_jax_param)
+from tpu_inference.layers.vllm.quantization.common import JaxCommonLinearConfig
+
+P = PartitionSpec
+
+
+def requantize_with_max_scale(
+        weight: torch.Tensor, weight_scale: torch.Tensor,
+        logical_widths: list[int]) -> tuple[torch.Tensor, torch.Tensor]:
+    dtype = weight.dtype
+    dtype_info = torch.finfo(dtype)
+    maxval = float(dtype_info.max)
+    minval = float(dtype_info.min)
+
+    max_w_scale = weight_scale.max()
+
+    unfused_module_in_checkpoint = (weight_scale[-1]
+                                    > torch.finfo(torch.float8_e4m3fn).min)
+
+    # If the checkpoint is unfused, requantize with the single max scale.
+    if unfused_module_in_checkpoint:
+        start = 0
+        for idx, logical_width in enumerate(logical_widths):
+            # Skip any component with zero width.
+            if logical_width == 0:
+                continue
+            end = start + logical_width
+            weight_dq = per_tensor_dequantize(weight[start:end, :],
+                                              weight_scale[idx])
+            weight_q = weight_dq / max_w_scale
+            weight[start:end, :] = weight_q.clamp(minval, maxval).to(dtype)
+            start = end
+
+    return max_w_scale, weight
+
+
+class VllmCompressedTensorsW8A8Fp8(CompressedTensorsW8A8Fp8):
+
+    def __init__(
+        self,
+        weight_quant: QuantizationArgs,
+        is_static_input_scheme: bool,
+        jax_config: JaxCommonLinearConfig,
+    ):
+        super().__init__(weight_quant, is_static_input_scheme)
+
+        self.jax_config = jax_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight = layer.weight
+        weight_scale = layer.weight_scale
+
+        if self.is_static_input_scheme:
+            # In static quant, all input_scales share the same value.
+            assert layer.input_scale.min() == layer.input_scale.max()
+            input_scale_first = layer.input_scale[0]
+
+            input_scale = jax.device_put(
+                t2j(input_scale_first, use_dlpack=False),
+                NamedSharding(self.jax_config.mesh, P()))
+            input_scale = torch.nn.Parameter(torch_view(input_scale),
+                                             requires_grad=False)
+            delattr(layer, "input_scale")
+            layer.input_scale = input_scale
+
+            # TODO(kyuyeunk): Investigate performance gain from merging scales.
+            # By merging input and weight scales, we reduce the number of muls
+            # required for dequantization from 2 (one per scale) to 1.
+            # weight_scale *= input_scale_first
+
+        if self.strategy == QuantizationStrategy.TENSOR:
+            weight_scale, weight = requantize_with_max_scale(
+                weight, weight_scale, self.jax_config.output_sizes)
+            weight_scale = jax.device_put(
+                t2j(weight_scale, use_dlpack=False),
+                NamedSharding(self.jax_config.mesh, P()))
+            weight_scale = torch.nn.Parameter(torch_view(weight_scale),
+                                              requires_grad=False)
+        else:
+            weight_scale = weight_scale.squeeze(-1)
+            weight_scale = torch_to_jax_param(
+                weight_scale,
+                NamedSharding(self.jax_config.mesh,
+                              self.jax_config.bias_sharding),
+                self.jax_config.output_sizes, self.jax_config.n_shards,
+                self.jax_config.fuse_matmuls)
+        delattr(layer, "weight_scale")
+        layer.weight_scale = weight_scale
+
+        weight = torch_to_jax_param(
+            layer.weight,
+            NamedSharding(self.jax_config.mesh,
+                          self.jax_config.weight_sharding),
+            self.jax_config.output_sizes, self.jax_config.n_shards,
+            self.jax_config.fuse_matmuls)
+        delattr(layer, "weight")
+        layer.weight = weight
+
+        if layer.bias is not None:
+            bias = torch_to_jax_param(
+                layer.bias,
+                NamedSharding(self.jax_config.mesh,
+                              self.jax_config.bias_sharding),
+                self.jax_config.output_sizes, self.jax_config.n_shards,
+                self.jax_config.fuse_matmuls)
+            delattr(layer, "bias")
+            layer.bias = bias
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+        with jax.named_scope(layer._get_name()):
+            if self.jax_config.fuse_matmuls:
+                return self._apply_fused(layer, x, bias)
+            else:
+                return self._apply_split(layer, x, bias)
+
+    def _apply_fused(self, layer: torch.nn.Module, x: torch.Tensor,
+                     bias: Optional[torch.Tensor]) -> torch.Tensor:
+        x_jax = jax_view(x)
+        weight_jax = jax_view(layer.weight)
+        weight_scale_jax = jax_view(layer.weight_scale)
+
+        if self.is_static_input_scheme:
+            # TODO(kyuyeunk): Add kernel support for static quant
+            input_scale = jax_view(layer.input_scale)
+            dtype_info = jnp.finfo(weight_jax.dtype)
+            maxval = float(dtype_info.max)
+            minval = float(dtype_info.min)
+            x_q = jnp.clip(x_jax / input_scale.astype(x_jax.dtype), minval,
+                           maxval).astype(weight_jax.dtype)
+
+            outs = jax.lax.dot_general(
+                x_q,
+                weight_jax,
+                (((1, ), (1, )), ((), ())),
+                preferred_element_type=jnp.float32,
+            )
+            outs *= weight_scale_jax
+            outs = outs.astype(x_jax.dtype)
+        else:
+            outs = sharded_quantized_matmul(x_jax, weight_jax,
+                                            weight_scale_jax,
+                                            self.jax_config.mesh,
+                                            self.jax_config.weight_sharding)
+
+        if bias is not None and not layer.skip_bias_add:
+            outs += jax_view(bias)
+        outs = slice_sharded_tensor_for_concatenation(
+            outs, self.jax_config.output_sizes, self.jax_config.n_shards)
+        return torch_view(jnp.concatenate(outs, axis=-1))
+
+    def _apply_split(self, layer: torch.nn.Module, x: torch.Tensor,
+                     bias: Optional[torch.Tensor]) -> torch.Tensor:
+        assert isinstance(layer.weight, torch.nn.ParameterList)
+
+        x_jax = jax_view(x)
+        outs = []
+        for i, (weight, weight_scale) in enumerate(
+                zip(layer.weight, layer.weight_scale)):
+            weight_jax = jax_view(weight)
+            weight_scale_jax = jax_view(weight_scale)
+
+            if self.is_static_input_scheme:
+                # TODO(kyuyeunk): Add kernel support for static quant
+                input_scale = jax_view(layer.input_scale)
+                dtype_info = jnp.finfo(weight_jax.dtype)
+                maxval = float(dtype_info.max)
+                minval = float(dtype_info.min)
+                x_q = jnp.clip(x_jax / input_scale.astype(x_jax.dtype),
+                               minval, maxval).astype(weight_jax.dtype)
+
+                out = jax.lax.dot_general(
+                    x_q,
+                    weight_jax,
+                    (((1, ), (1, )), ((), ())),
+                    preferred_element_type=jnp.float32,
+                )
+                # TODO(kyuyeunk): Investigate performance gain from merging
+                # scales.
+                # out *= weight_scale_jax
+                out *= weight_scale_jax * input_scale
+                out = out.astype(x_jax.dtype)
+            else:
+                out = sharded_quantized_matmul(x_jax, weight_jax,
+                                               weight_scale_jax,
+                                               self.jax_config.mesh,
+                                               self.jax_config.weight_sharding)
+
+            if bias is not None and not layer.skip_bias_add:
+                out += jax_view(bias[i])
+            outs.append(out)
+        return torch_view(jnp.concatenate(outs, axis=-1))
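
The requantize_with_max_scale helper above collapses the per-shard scales of a fused fp8 checkpoint into one shared scale: each shard is dequantized with its own scale, re-expressed relative to the maximum scale, and clamped to the fp8 range, so a single scalar can afterwards dequantize the whole fused weight. A minimal numeric sketch of that idea, using toy values in plain jax.numpy rather than code from the package:

import jax.numpy as jnp

# Two logical shards of a fused layer (e.g. gate_proj and up_proj), each
# stored with its own per-tensor scale. Values here are made up.
shard_scales = jnp.array([0.02, 0.05])
shards = [jnp.array([10.0, -30.0]), jnp.array([40.0, 5.0])]  # quantized values

max_scale = shard_scales.max()
fp8_max = float(jnp.finfo(jnp.float8_e4m3fn).max)  # ~448 for e4m3

requantized = []
for q, s in zip(shards, shard_scales):
    w_dq = q * s  # dequantize with the shard's own scale
    # Re-express relative to the shared max scale, clamped to the fp8 range.
    requantized.append(jnp.clip(w_dq / max_scale, -fp8_max, fp8_max))

# A single shared scale now reproduces every shard's dequantized values.
for q, s, w_rq in zip(shards, shard_scales, requantized):
    assert jnp.allclose(w_rq * max_scale, q * s)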
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -0,0 +1,136 @@
+from typing import Optional
+
+import jax
+import jax.numpy as jnp
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+from jax.sharding import NamedSharding, PartitionSpec
+from torchax.interop import jax_view, torch_view
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import \
+    CompressedTensorsW8A8Int8
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import \
+    convert_to_channelwise
+
+from tpu_inference.layers.vllm.linear_common import (
+    sharded_quantized_matmul, slice_sharded_tensor_for_concatenation,
+    torch_to_jax_param)
+from tpu_inference.layers.vllm.quantization.common import JaxCommonLinearConfig
+
+P = PartitionSpec
+logger = init_logger(__name__)
+
+
+class VllmCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8):
+
+    def __init__(self, strategy: str, is_static_input_scheme: bool,
+                 input_symmetric: bool, jax_config: JaxCommonLinearConfig):
+        super().__init__(strategy, is_static_input_scheme, input_symmetric)
+
+        self.jax_config = jax_config
+        self.is_channelwise = (self.strategy == QuantizationStrategy.CHANNEL)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight = torch_to_jax_param(
+            layer.weight,
+            NamedSharding(self.jax_config.mesh,
+                          self.jax_config.weight_sharding),
+            self.jax_config.output_sizes,
+            self.jax_config.n_shards,
+            self.jax_config.fuse_matmuls,
+        )
+        delattr(layer, "weight")
+        layer.weight = weight
+
+        weight_scale = layer.weight_scale
+        is_fused_module = len(layer.logical_widths) > 1
+        if is_fused_module and not self.is_channelwise:
+            weight_scale = convert_to_channelwise(weight_scale,
+                                                  layer.logical_widths)
+        weight_scale = weight_scale.squeeze(-1)
+
+        weight_scale = torch_to_jax_param(
+            weight_scale,
+            NamedSharding(self.jax_config.mesh, self.jax_config.bias_sharding),
+            self.jax_config.output_sizes,
+            self.jax_config.n_shards,
+            self.jax_config.fuse_matmuls,
+        )
+        delattr(layer, "weight_scale")
+        layer.weight_scale = weight_scale
+
+        if layer.bias is not None and not layer.skip_bias_add:
+            if layer.return_bias:
+                logger.warning_once("Bias might return incorrect value.")
+
+            bias = torch_to_jax_param(
+                layer.bias,
+                NamedSharding(self.jax_config.mesh,
+                              self.jax_config.bias_sharding),
+                self.jax_config.output_sizes,
+                self.jax_config.n_shards,
+                self.jax_config.fuse_matmuls,
+            )
+            delattr(layer, "bias")
+            layer.bias = bias
+
+        # TODO(kyuyeunk): Support static range input quantization.
+        assert getattr(layer, "input_scale", None) is None
+        assert getattr(layer, "input_zero_point", None) is None
+        assert getattr(layer, "azp_adj", None) is None
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+        with jax.named_scope(layer._get_name()):
+            if self.jax_config.fuse_matmuls:
+                out = self._apply_fused(layer, x, bias)
+            else:
+                out = self._apply_split(layer, x, bias)
+
+            return out
+
+    def _apply_fused(self, layer: torch.nn.Module, x: torch.Tensor,
+                     bias: Optional[torch.Tensor]) -> torch.Tensor:
+        x_jax = jax_view(x)
+        weight_jax = jax_view(layer.weight)
+        weight_scale_jax = jax_view(layer.weight_scale)
+
+        outs = sharded_quantized_matmul(
+            x_jax,
+            weight_jax,
+            weight_scale_jax,
+            self.jax_config.mesh,
+            self.jax_config.weight_sharding,
+        )
+        if bias is not None and not layer.skip_bias_add:
+            outs += jax_view(bias)
+
+        outs = slice_sharded_tensor_for_concatenation(
+            outs, self.jax_config.output_sizes, self.jax_config.n_shards)
+        out = jnp.concatenate(outs, axis=-1)
+        return torch_view(out)
+
+    def _apply_split(self, layer: torch.nn.Module, x: torch.Tensor,
+                     bias: Optional[torch.Tensor]) -> torch.Tensor:
+        assert isinstance(layer.weight, torch.nn.ParameterList)
+
+        x_jax = jax_view(x)
+        outs = []
+        for i, (weight, weight_scale) in enumerate(
+                zip(layer.weight, layer.weight_scale)):
+            weight_jax = jax_view(weight)
+            weight_scale_jax = jax_view(weight_scale)
+
+            out = sharded_quantized_matmul(
+                x_jax,
+                weight_jax,
+                weight_scale_jax,
+                self.jax_config.mesh,
+                self.jax_config.weight_sharding,
+            )
+            if bias is not None and not layer.skip_bias_add:
+                out += jax_view(bias[i])
+
+            outs.append(out)
+        out = jnp.concatenate(outs, axis=-1)
+        return torch_view(out)
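
Both int8 apply paths above reduce to the same math: multiply the activations by the transposed int8 weight and rescale each output channel by its weight scale; sharded_quantized_matmul handles the sharded (and possibly kernel-backed) version of this. A rough unsharded reference of that computation, assuming the [out_features, in_features] weight layout used above; this is a sketch only, not the package's kernel:

import jax
import jax.numpy as jnp

def quantized_matmul_reference(x, w_q, w_scale):
    # x:       [tokens, in_features] activations (e.g. bf16)
    # w_q:     [out_features, in_features] int8 weights
    # w_scale: [out_features] per-output-channel scales
    acc = jax.lax.dot_general(
        x, w_q.astype(x.dtype),
        (((1, ), (1, )), ((), ())),  # contract the in_features dimensions
        preferred_element_type=jnp.float32,
    )
    return (acc * w_scale).astype(x.dtype)

x = jnp.ones((2, 8), dtype=jnp.bfloat16)
w_q = jnp.ones((4, 8), dtype=jnp.int8)
w_scale = jnp.full((4, ), 0.1, dtype=jnp.float32)
print(quantized_matmul_reference(x, w_q, w_scale))  # roughly 0.8 everywhere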
tpu_inference/layers/vllm/quantization/mxfp4.py
@@ -0,0 +1,266 @@
+from typing import Callable, Optional, Union
+
+import jax
+import jax.numpy as jnp
+import torch
+from jax.experimental.layout import Format, Layout
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torch.nn.parameter import Parameter
+from torchax.interop import jax_view, torch_view
+from torchax.ops.mappings import t2j
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig, FusedMoEQuantConfig, biased_moe_quant_config)
+from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
+                                                        FusedMoEMethodBase)
+from vllm.model_executor.layers.linear import LinearBase
+from vllm.model_executor.layers.quantization import \
+    register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizeMethodBase
+from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
+                                                           Mxfp4Config,
+                                                           Mxfp4MoEMethod)
+from vllm.model_executor.layers.quantization.utils.quant_utils import \
+    is_layer_skipped
+
+from tpu_inference.layers.common.quant_methods import (MXFP4,
+                                                       get_tpu_quant_method)
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
+from tpu_inference.layers.vllm.linear_common import \
+    reorder_concatenated_tensor_for_sharding
+from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedLinearMethod
+
+MXFP4_BLOCK_SIZE = 32
+
+P = PartitionSpec
+logger = init_logger(__name__)
+
+
+# TODO(kyuyeunk): Move these functions into a common utility file.
+def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:
+    assert u8_packed_e2m1.dtype == jnp.uint8
+    e2m1 = jax.lax.bitcast_convert_type(u8_packed_e2m1, jnp.float4_e2m1fn)
+    # bitcast adds a trailing dimension that splits each 8-bit value into two
+    # e2m1 values; flatten it into the last dim.
+    return jnp.reshape(e2m1, e2m1.shape[:-2] + (-1, ))
+
+
+def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
+    e8_finfo = jnp.finfo(jnp.float8_e8m0fnu)
+    exponents = u8.astype(jnp.int32) + e8_finfo.minexp
+    ones = jnp.ones_like(u8, dtype=jnp.float32)
+    return jnp.ldexp(ones, exponents)
+
+
+def dequantize_block_weight(weight: jax.Array,
+                            scale: jax.Array,
+                            block_size: int,
+                            out_dtype: jnp.dtype = jnp.bfloat16) -> jax.Array:
+    orig_shape = weight.shape
+    weight_block = weight.reshape(orig_shape[:-1] + (-1, block_size))
+    weight_dequantized = weight_block.astype(jnp.float32) * jnp.expand_dims(
+        scale, -1)
+    return weight_dequantized.reshape(orig_shape).astype(out_dtype)
+
+
+@register_quantization_config(get_tpu_quant_method(MXFP4))
+class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
+
+    @classmethod
+    def get_name(cls):
+        return MXFP4
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention  # Avoid circular import
+
+        if isinstance(layer, LinearBase):
+            linear_config = self.get_linear_config(layer)
+            if self.ignored_layers and is_layer_skipped(
+                    prefix=prefix,
+                    ignored_layers=self.ignored_layers,
+                    fused_mapping=self.packed_modules_mapping,
+            ):
+                return VllmUnquantizedLinearMethod(linear_config)
+            # TODO: Add support for an MXFP4 LinearMethod.
+            # An MXFP4 LinearMethod is available in AMD-Quark; refer to that
+            # implementation if you are interested in enabling MXFP4 here.
+            logger.warning_once(
+                "MXFP4 linear layer is not implemented - falling back to "
+                "UnquantizedLinearMethod.")
+            return VllmUnquantizedLinearMethod(linear_config)
+        elif isinstance(layer, FusedMoE):
+            return VllmMxfp4MoEMethod(layer.moe_config, self.mesh)
+        elif isinstance(layer, Attention):
+            # TODO: Add support for MXFP4 Attention.
+            logger.warning_once("MXFP4 attention layer is not implemented. "
+                                "Skipping quantization for this layer.")
+        return None
+
+
+class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
+
+    def __init__(self, moe: FusedMoEConfig, mesh: Mesh):
+        FusedMoEMethodBase.__init__(self, moe)
+
+        # We piggyback on the Triton implementation as it applies minimal
+        # hardware-specific post-processing to the weights.
+        self.mxfp4_backend = Mxfp4Backend.TRITON
+        self.mesh = mesh
+
+    def get_fused_moe_quant_config(
+            self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
+        # Because the weights have been dequantized, only the biased moe
+        # config is needed.
+        # TODO(kyuyeunk): Add native support for MXFP4.
+        return biased_moe_quant_config(
+            layer.w13_bias,
+            layer.w2_bias,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        assert isinstance(layer, FusedMoE)
+
+        w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
+        w13_weight_scale = e8m0_to_fp32(
+            t2j(layer.w13_weight_scale, use_dlpack=False))
+        w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+
+        w2_weight = u8_unpack_e2m1(t2j(layer.w2_weight, use_dlpack=False))
+        w2_weight_scale = e8m0_to_fp32(
+            t2j(layer.w2_weight_scale, use_dlpack=False))
+        w2_bias = t2j(layer.w2_bias, use_dlpack=False)
+
+        # We dequantize fp4 weights into bf16.
+        # TODO(kyuyeunk): Add native support for MXFP4.
+        w13_weight = dequantize_block_weight(w13_weight, w13_weight_scale,
+                                             MXFP4_BLOCK_SIZE, jnp.bfloat16)
+        w2_weight = dequantize_block_weight(w2_weight, w2_weight_scale,
+                                            MXFP4_BLOCK_SIZE, jnp.bfloat16)
+
+        # Because the weights have been dequantized, the scales are no longer
+        # used.
+        delattr(layer, "w13_weight_scale")
+        delattr(layer, "w2_weight_scale")
+
+        if layer.activation == "swigluoai":
+            # When using swigluoai, vLLM splits the gmm output in an
+            # interleaved way. However, an interleaved split is not performant
+            # on TPU, so we preprocess the weight so that splitting the gmm
+            # output down the middle still gives the same result.
+            w1_weight = w13_weight[:, ::2, :]
+            w3_weight = w13_weight[:, 1::2, :]
+            w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+
+            w1_bias = w13_bias[:, ::2]
+            w3_bias = w13_bias[:, 1::2]
+            w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
+
+        # TODO(kyuyeunk): Add weight processing logic for the new kernel.
+        if layer.use_ep:
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+
+        else:
+            intermediate_size = w13_weight.shape[1] // 2
+            assert intermediate_size == w2_weight.shape[-1]
+            output_sizes = [intermediate_size, intermediate_size]
+            n_shards = self.mesh.shape["model"]
+            assert intermediate_size % n_shards == 0
+            w13_weight = reorder_concatenated_tensor_for_sharding(w13_weight,
+                                                                  output_sizes,
+                                                                  n_shards,
+                                                                  dim=1)
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, "model", None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, None, "model"))))
+
+            w13_bias = reorder_concatenated_tensor_for_sharding(w13_bias,
+                                                                output_sizes,
+                                                                n_shards,
+                                                                dim=1)
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P(None, "model"))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P(None, None))))
+
+        layer.w13_weight = Parameter(torch_view(w13_weight),
+                                     requires_grad=False)
+        layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
+
+        layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
+        layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        assert isinstance(layer, FusedMoE)
+        if scoring_func != "softmax":
+            raise NotImplementedError(
+                "Only softmax is supported for scoring_func")
+
+        # Use the original implementation
+        output = fused_moe_func_padded(
+            jax_view(x),
+            jax_view(layer.w13_weight),
+            jax_view(layer.w2_weight),
+            jax_view(layer.w13_bias) if self.moe.has_bias else None,
+            jax_view(layer.w2_bias) if self.moe.has_bias else None,
+            jax_view(router_logits),
+            topk=top_k,
+            global_num_experts=global_num_experts,
+            renormalize=renormalize,
+            reduce_results=layer.reduce_results,
+            mesh=self.mesh,
+            use_ep=layer.use_ep,
+            activation=activation,
+        )
+
+        return torch_view(output)
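
For context, the MXFP4 checkpoints handled above pack two e2m1 values into each uint8 byte and attach one e8m0 (power-of-two) scale per 32-value block; process_weights_after_loading simply unpacks and dequantizes everything to bf16 before sharding, rather than running a native MXFP4 kernel. A small usage sketch of the module-level helpers defined in this file, with toy shapes rather than real checkpoint data; it assumes a JAX build with float4_e2m1fn support:

import jax.numpy as jnp

from tpu_inference.layers.vllm.quantization.mxfp4 import (
    MXFP4_BLOCK_SIZE, dequantize_block_weight, e8m0_to_fp32, u8_unpack_e2m1)

experts, out_dim, in_dim = 2, 4, 2 * MXFP4_BLOCK_SIZE  # toy MoE weight shape

# Packed layout: two e2m1 nibbles per byte, one e8m0 scale byte per block.
packed = jnp.zeros((experts, out_dim, in_dim // 2), dtype=jnp.uint8)
scales_u8 = jnp.full((experts, out_dim, in_dim // MXFP4_BLOCK_SIZE), 127,
                     dtype=jnp.uint8)  # 127 encodes a scale of 1.0 in e8m0

weight_fp4 = u8_unpack_e2m1(packed)  # [experts, out, in] e2m1 values
scales = e8m0_to_fp32(scales_u8)     # power-of-two scales as fp32
weight_bf16 = dequantize_block_weight(weight_fp4, scales, MXFP4_BLOCK_SIZE)

print(weight_bf16.shape, weight_bf16.dtype)  # (2, 4, 64) bfloat16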