tpu-inference 0.13.2.dev20260104.tar.gz → 0.13.2rc1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/MANIFEST.in +1 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference.egg-info → tpu_inference-0.13.2rc1}/PKG-INFO +1 -1
- tpu_inference-0.13.2rc1/requirements_v7x.txt +25 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/setup.py +19 -5
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_speculative_decoding.py +2 -2
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/test_qwix.py +1 -1
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_awq.py +5 -6
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_compressed_tensors_moe.py +3 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +9 -32
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +4 -6
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_mxfp4.py +5 -13
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_unquantized.py +16 -27
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/test_layers.py +3 -5
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/executors/ray_distributed_executor.py +3 -3
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/quantization.py +2 -14
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/common/fused_moe_gmm.py → tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/fused_moe.py +1 -1
- tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/linear_common.py +221 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/quantization/__init__.py +3 -3
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/quantization/awq.py +81 -81
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/quantization/configs.py → tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/quantization/common.py +15 -12
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +5 -5
- tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +91 -97
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +43 -65
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/quantization/fp8.py +5 -6
- tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/quantization/mxfp4.py +410 -0
- tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/quantization/unquantized.py +428 -0
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/process_weights/cleanup_sharding.py → tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/sharding.py +12 -4
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/common/model_loader.py +1 -6
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/utils/qwix/qwix_utils.py +3 -3
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/vllm/vllm_model_wrapper.py +1 -2
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/platforms/tpu_platform.py +7 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/compilation_manager.py +4 -10
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/kv_cache_manager.py +2 -1
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/lora_utils.py +1 -2
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/multimodal_manager.py +1 -1
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/tpu_runner.py +1 -3
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1/tpu_inference.egg-info}/PKG-INFO +1 -1
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference.egg-info/SOURCES.txt +4 -8
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/common/utils.py +0 -94
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/fused_moe.py +0 -114
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/linear.py +0 -64
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +0 -369
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/process_weights/linear_weights.py +0 -174
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +0 -199
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/quantization/mxfp4.py +0 -225
- tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/quantization/unquantized.py +0 -298
- tpu_inference-0.13.2.dev20260104/tpu_inference/worker/__init__.py +0 -13
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/LICENSE +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/README.md +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/pyproject.toml +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/requirements.txt +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/setup.cfg +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/core/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/core/test_core_tpu.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/core/test_disagg_executor.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/core/test_disagg_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/core/test_dp_scheduler.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/core/test_init.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/distributed/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/distributed/test_distributed_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/distributed/test_tpu_connector.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_async_scheduler.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_data_parallel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_hybrid_kvcache.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_local_disagg.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_model_loader.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_multi_modal_inference.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_pipeline_parallel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_runai_model_streamer_loader.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_sampling_params.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_structured_decoding.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/executors/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/executors/test_ray_distributed_executor.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/experimental/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/experimental/test_llama3_jax_stashed.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/collectives/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/collectives/all_gather_matmul_kernel_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/fused_moe_v1_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/gmm_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/mla_v1_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/quantized_matmul_kernel_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/ragged_kv_cache_update_v2_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/ragged_paged_attention_kernel_v2_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/kernels/ragged_paged_attention_kernel_v3_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/common/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/common/test_attention_interface.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/common/test_quantization.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/attention/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/attention/test_common_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/attention/test_deepseek_v3_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/attention/test_llama4_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/moe/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/moe/test_deepseek_moe.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/sample/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/sample/test_rejection_sampler.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/sample/test_sampling.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/sample/test_sampling_metadata.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/test_layers.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/test_rope.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/test_sharding.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/test_transformer_block.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_fp8.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/conftest.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/test_bgmv.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/test_lora.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/test_lora_perf.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/common/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/common/test_model_loader.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_deepseek_v3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_llama3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_llama4.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_llama_eagle3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_llama_guard_4.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_qwen2.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_qwen2_5_vl.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_qwen3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/test_weight_loading.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/utils/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/models/jax/utils/test_multi_modal_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/platforms/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/platforms/test_tpu_platform.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_block_table.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_input_batch.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_kv_cache.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_kv_cache_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_multimodal_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_persistent_batch_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_speculative_decoding_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_structured_decoding_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_tpu_runner.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_tpu_runner_dp.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_tpu_runner_mesh.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/runner/test_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/spec_decode/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/spec_decode/test_eagle3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/test_base.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/test_envs.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/test_tpu_info.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/test_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/worker/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/worker/tpu_worker_test.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/core/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/core/core_tpu.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/core/disagg_executor.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/core/disagg_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/core/sched/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/core/sched/dp_scheduler.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/distributed/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/distributed/jax_parallel_state.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/distributed/tpu_connector.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/distributed/utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/env_override.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/envs.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/executors/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/experimental/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/experimental/llama3_jax_stashed.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/collectives/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/collectives/all_gather_matmul.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/collectives/util.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/flash_attention/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/flash_attention/kernel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/fused_moe/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/fused_moe/v1/kernel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/megablox/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/megablox/common.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/megablox/gmm.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/mla/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/mla/v1/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/mla/v1/kernel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/quantized_matmul/kernel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/quantized_matmul/util.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/kernels/ragged_paged_attention/v3/util.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/attention_interface.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/attention_metadata.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/binary_search.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/quant_methods.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/sharding.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/attention/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/attention/attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/attention/deepseek_v3_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/attention/gpt_oss_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/attention/llama4_attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/base.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/constants.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/layers.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/misc.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/moe/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/moe/deepseek_v3_moe.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/moe/gpt_oss_moe.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/moe/moe.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/pp_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/rope.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/rope_interface.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/sample/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/sample/rejection_sampler.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/sample/sampling.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/sample/sampling_metadata.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/jax/transformer_block.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/vllm/attention.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/process_weights → tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/quantization/compressed_tensors}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/quantization/compressed_tensors → tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/logger.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes → tpu_inference-0.13.2rc1/tpu_inference/lora}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/lora/torch_lora_ops.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/lora/torch_punica_tpu.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/lora → tpu_inference-0.13.2rc1/tpu_inference/models}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/models → tpu_inference-0.13.2rc1/tpu_inference/models/common}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/models/common → tpu_inference-0.13.2rc1/tpu_inference/models/jax}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/deepseek_v3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/gpt_oss.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/jax_intermediate_tensor.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/llama3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/llama4.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/llama_eagle3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/llama_guard_4.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/qwen2.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/qwen2_5_vl.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/qwen3.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/models/jax → tpu_inference-0.13.2rc1/tpu_inference/models/jax/utils}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/utils/file_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/utils/multi_modal_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/models/jax/utils → tpu_inference-0.13.2rc1/tpu_inference/models/jax/utils/qwix}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/jax/utils/weight_utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/models/jax/utils/qwix → tpu_inference-0.13.2rc1/tpu_inference/models/vllm}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/models/vllm/vllm_model_wrapper_context.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/platforms/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/models/vllm → tpu_inference-0.13.2rc1/tpu_inference/runner}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/block_table.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/input_batch.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/kv_cache.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/persistent_batch_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/speculative_decoding_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/structured_decoding_manager.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/runner/utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/runner → tpu_inference-0.13.2rc1/tpu_inference/spec_decode}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/spec_decode → tpu_inference-0.13.2rc1/tpu_inference/spec_decode/jax}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/spec_decode/jax/eagle3.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/tpu_info.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/utils.py +0 -0
- {tpu_inference-0.13.2.dev20260104/tpu_inference/spec_decode/jax → tpu_inference-0.13.2rc1/tpu_inference/worker}/__init__.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/worker/tpu_worker.py +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference.egg-info/dependency_links.txt +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference.egg-info/requires.txt +0 -0
- {tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference.egg-info/top_level.txt +0 -0
tpu_inference-0.13.2rc1/requirements_v7x.txt
ADDED
@@ -0,0 +1,25 @@
+# This file contains additional dependencies needed for TPU v7x support.
+# It is expected to be used in conjunction with the main requirements.txt file.
+--pre
+-i https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/
+-f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+jax==0.8.1
+jaxlib==0.8.1
+jaxtyping==0.3.2
+libtpu==0.0.31
+
+tpu-info==0.7.1
+yapf==0.43.0
+pytest
+pytest-mock
+absl-py
+numpy
+google-cloud-storage
+flax==0.11.1
+torchax==0.0.10
+qwix==0.1.1
+torchvision==0.24.0
+pathwaysutils
+parameterized
+numba==0.62.1
+runai-model-streamer[s3,gcs]==0.15.0
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/setup.py
RENAMED
@@ -20,26 +20,40 @@ def get_requirements() -> List[str]:
         requirements = f.read().strip().split("\n")
         resolved_requirements = []
         for line in requirements:
+            if not line or line.startswith("#"):
+                continue
             if line.startswith("-r "):
                 resolved_requirements += _read_requirements(line.split()[1])
-            elif line.startswith("--"):
+            elif line.startswith(("-", "--")):
                 continue
             else:
                 resolved_requirements.append(line)
         return resolved_requirements

     try:
-        requirements = _read_requirements("
+        #requirements = _read_requirements("requirements_v7x.txt")
+
+        # For TPU v7x build
+        if os.getenv("IS_FOR_V7X", "false").lower() == "true":
+            print("Using requirements_v7x.txt")
+            requirements = _read_requirements("requirements_v7x.txt")
+            #requirements.extend(v7x_requirements)
+        else:
+            #For TPU v6e build
+            print("Using requirements.txt")
+            requirements = _read_requirements("requirements.txt")
+
     except ValueError:
         print("Failed to read requirements.txt in vllm_tpu.")
     return requirements


 def get_version():
-
-
-
+    version = os.getenv("VLLM_VERSION_OVERRIDE", "0.0.0").strip()
+    if os.getenv("IS_FOR_V7X", "false").lower() == "true":
+        version = f"{version}.post7"

+    return version

 setup(
     name="tpu_inference",
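For orientation, here is a minimal, self-contained sketch of what the IS_FOR_V7X switch in the new setup.py does; the environment values below are illustrative placeholders, not part of the package:

import os

# Illustrative environment for a v7x build; real builds export these externally.
os.environ.setdefault("VLLM_VERSION_OVERRIDE", "0.13.2rc1")
os.environ.setdefault("IS_FOR_V7X", "true")

is_v7x = os.getenv("IS_FOR_V7X", "false").lower() == "true"

# Mirrors the new get_version(): the v7x build appends a .post7 suffix.
version = os.getenv("VLLM_VERSION_OVERRIDE", "0.0.0").strip()
if is_v7x:
    version = f"{version}.post7"

# Mirrors get_requirements(): the same flag selects the requirements file.
requirements_file = "requirements_v7x.txt" if is_v7x else "requirements.txt"

print(version)            # 0.13.2rc1.post7
print(requirements_file)  # requirements_v7x.txt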
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/e2e/test_speculative_decoding.py
RENAMED
@@ -271,7 +271,7 @@ def test_ngram_performance_random(
         "prompt_lookup_max": 2,
         "prompt_lookup_min": 2,
         "num_speculative_tokens": 4,
-    }, 1.
+    }, 1.5 if _is_v7x() else 3.0)


 def test_eagle3_correctness(
@@ -308,4 +308,4 @@ def test_eagle3_performance(
         "model": "unkmaster/EAGLE3-LLaMA3.1-Instruct-8B",
         "num_speculative_tokens": 2,
         "draft_tensor_parallel_size": 1
-    },
+    }, 1.2 if _is_v7x() else 1.8)
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/jax/test_qwix.py
RENAMED
@@ -832,7 +832,7 @@ class TestGetDefaultQwixQuantizationConfig(unittest.TestCase):
         # Patch the constants in the module where the function resides
         self.patchers = [
             patch(
-                "tpu_inference.models.jax.utils.qwix.qwix_utils.
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_DEEPSEEK_FP8_CONFIG",
                 self.mock_deepseek_config),
             patch(
                 "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_LLAMA4_FP8_CONFIG",
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_awq.py
RENAMED
@@ -39,8 +39,7 @@ from vllm.scalar_type import scalar_types
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
 from tpu_inference.layers.vllm.quantization.awq import (VllmAWQConfig,
                                                          VllmAWQLinearMethod)
-from tpu_inference.layers.vllm.quantization.
-    VllmQuantLinearConfig
+from tpu_inference.layers.vllm.quantization.common import JaxCommonLinearConfig

 from . import utils as test_utils

@@ -104,8 +103,8 @@ def return_ref_and_layer_output(
     assert isinstance(quant_method, VllmAWQLinearMethod)
     quant_config = quant_method.quant_config
     assert isinstance(quant_config, VllmAWQConfig)
-    jax_config = quant_method.
-    assert isinstance(jax_config,
+    jax_config = quant_method.jax_config
+    assert isinstance(jax_config, JaxCommonLinearConfig)

     input_tensor = torch.rand(
         batch_size, layer.input_size, dtype=torch.bfloat16) / 10
@@ -135,8 +134,8 @@ def initialize_and_return_layer_weights(layer: torch.nn.Module):
     assert isinstance(quant_method, VllmAWQLinearMethod)
     quant_config = quant_method.quant_config
     assert isinstance(quant_config, VllmAWQConfig)
-    jax_config = quant_method.
-    assert isinstance(jax_config,
+    jax_config = quant_method.jax_config
+    assert isinstance(jax_config, JaxCommonLinearConfig)

     # torch.rand returns value in the range of [0, 1). We subtract by 0.2 to
     # simulate asymmetry
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_compressed_tensors_moe.py
RENAMED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 import tempfile

 import jax.numpy as jnp
@@ -42,6 +43,8 @@ from . import utils as test_utils

 P = PartitionSpec

+os.environ['VLLM_DISABLE_SHARED_EXPERTS_STREAM'] = '1'
+
 MODEL = 'BCCard/Qwen3-30B-A3B-FP8-Dynamic'

{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py
RENAMED
@@ -16,7 +16,6 @@ import tempfile
 from typing import Optional

 import jax
-import jax.numpy as jnp
 import pytest
 import torch
 import torchax
@@ -37,15 +36,12 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
     CompressedTensorsLinearMethod
 from vllm.model_executor.model_loader import get_model as vllm_get_model

-from tpu_inference.layers.common.quantization import (dequantize_tensor,
-                                                       quantize_tensor)
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
+from tpu_inference.layers.vllm.quantization.common import JaxCommonLinearConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import \
     VllmCompressedTensorsConfig
-from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import
-    VllmCompressedTensorsW8A8Fp8
-from tpu_inference.layers.vllm.quantization.configs import \
-    VllmQuantLinearConfig
+from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import (
+    VllmCompressedTensorsW8A8Fp8, requantize_with_max_scale)

 from . import utils as test_utils

@@ -102,8 +98,8 @@ def return_ref_and_layer_output(layer: torch.nn.Module, batch_size: int = 16):
     assert isinstance(layer, LinearBase)
     scheme = layer.scheme
     assert isinstance(scheme, VllmCompressedTensorsW8A8Fp8)
-    quant_config = scheme.
-    assert isinstance(quant_config,
+    quant_config = scheme.jax_config
+    assert isinstance(quant_config, JaxCommonLinearConfig)
     quant_method = layer.quant_method
     assert isinstance(quant_method, CompressedTensorsLinearMethod)
     per_tensor = scheme.strategy == QuantizationStrategy.TENSOR
@@ -118,27 +114,8 @@ def return_ref_and_layer_output(layer: torch.nn.Module, batch_size: int = 16):
     # For per_tensor with merged layers, vLLM requenzites them so all merged
     # layers shared the same scale values.
     if per_tensor:
-
-
-        weight = t2j(weight)
-        weight_scale = t2j(weight_scale)
-        weights = []
-        start = 0
-        # Multiple weights may have been concatenated. Loop through
-        # each weight and perform dequantization.
-        for i, output_size in enumerate(quant_config.output_sizes):
-            end = start + output_size
-            weights.append(
-                dequantize_tensor(weight[start:end], weight_scale[i]))
-            start = end
-        weight = jnp.concat(weights, axis=0)
-        weight, weight_scale = quantize_tensor(
-            jnp.float8_e4m3fn,
-            weight,
-            None,
-        )
-        weight = j2t(weight.astype(jnp.float32)).to(dtype)
-        weight_scale = j2t(weight_scale)
+        weight_scale, weight = requantize_with_max_scale(
+            layer.weight, layer.weight_scale, quant_config.output_sizes)
     if input_scale is not None:
         input_scale = input_scale.max()

@@ -174,8 +151,8 @@ def initialize_layer_weights(layer: torch.nn.Module):
     assert isinstance(layer, LinearBase)
     scheme = layer.scheme
     assert isinstance(scheme, VllmCompressedTensorsW8A8Fp8)
-    quant_config = scheme.
-    assert isinstance(quant_config,
+    quant_config = scheme.jax_config
+    assert isinstance(quant_config, JaxCommonLinearConfig)
     per_tensor = scheme.strategy == QuantizationStrategy.TENSOR

     weight_list = []
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_compressed_tensors_w8a8_int8.py
RENAMED
@@ -185,7 +185,7 @@ def test_row_parallel_linear(model, bias, num_devices, enable_sp,
         if bias:
             jax_row_linear.bias.data = bias_data

-        input_tensor = torch.rand(10,
+        input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
         input_tensor = input_tensor.to('cpu')

         jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
@@ -259,8 +259,7 @@ def test_column_parallel_linear(model, bias, num_devices, enable_sp,
         if bias:
             jax_column_linear.bias.data = bias_data

-        input_tensor = torch.rand(10,
-                                  dtype=dtype) / 10
+        input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
         input_tensor = input_tensor.to('cpu')

         jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
@@ -339,7 +338,7 @@ def test_qkv_parallel_linear(model, bias, num_devices, enable_sp, fuse_matmuls,
         if bias:
             jax_qkv_linear.bias.data = bias_data

-        input_tensor = torch.rand(10,
+        input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
         input_tensor = input_tensor.to('cpu')

         jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
@@ -415,8 +414,7 @@ def test_merged_column_parallel_linear(model, bias, num_devices, fuse_matmuls,
         if bias:
             jax_merged_column_linear.bias.data = bias_data

-        input_tensor = torch.rand(
-            10, jax_merged_column_linear.input_size, dtype=dtype) / 10
+        input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
         input_tensor = input_tensor.to('cpu')

         jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_mxfp4.py
RENAMED
@@ -13,7 +13,6 @@
 # limitations under the License.

 import tempfile
-from unittest import mock

 import jax
 import jax.numpy as jnp
@@ -30,7 +29,6 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE

-from tpu_inference.layers.vllm.fused_moe import FusedMoEBackend
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
 from tpu_inference.layers.vllm.quantization.mxfp4 import (VllmMxfp4Config,
                                                            VllmMxfp4MoEMethod)
@@ -162,8 +160,6 @@ def test_mxfp4_fused_moe(num_devices, num_tokens, intermediate_size,
     )
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = dtype
-    vllm_config.parallel_config = ParallelConfig(
-        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)

     quant_config = get_tpu_quantization_config(vllm_config, mesh)
     with set_current_vllm_config(vllm_config):
@@ -194,16 +190,13 @@ def test_mxfp4_fused_moe(num_devices, num_tokens, intermediate_size,

     with torchax.default_env(), set_forward_context(None, vllm_config):
         assert isinstance(vllm_fused_moe.quant_method, VllmMxfp4MoEMethod)
-        if use_ep:
-            assert vllm_fused_moe.quant_method.moe_backend == FusedMoEBackend.GMM_EP
-        else:
-            assert vllm_fused_moe.quant_method.moe_backend == FusedMoEBackend.GMM_TP

         jax_a = a.to('jax')
         score = score.to('jax')

         vllm_fused_moe.quant_method.process_weights_after_loading(
             vllm_fused_moe)
+
         actual = vllm_fused_moe(jax_a, score)

         torch.testing.assert_close(expected,
@@ -220,7 +213,6 @@ def test_mxfp4_fused_moe(num_devices, num_tokens, intermediate_size,
 @pytest.mark.parametrize("num_experts", [8])
 @pytest.mark.parametrize("topk", [2])
 @pytest.mark.parametrize("enable_attn_dp", [False, True])
-@mock.patch("os.environ", {"USE_MOE_EP_KERNEL": "1"})
 def test_mxfp4_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
                                     hidden_size, num_experts, topk,
                                     enable_attn_dp):
@@ -261,7 +253,7 @@ def test_mxfp4_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = dtype
     vllm_config.parallel_config = ParallelConfig(
-        tensor_parallel_size=mesh.devices.size
+        tensor_parallel_size=mesh.devices.size)

     quant_config = get_tpu_quantization_config(vllm_config, mesh)
     with set_current_vllm_config(vllm_config):
@@ -293,14 +285,14 @@ def test_mxfp4_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,

     with torchax.default_env(), set_forward_context(None, vllm_config):
         assert isinstance(vllm_fused_moe.quant_method, VllmMxfp4MoEMethod)
-        assert vllm_fused_moe.quant_method.moe_backend == FusedMoEBackend.FUSED_MOE

         jax_a = a.to('jax')
         score = score.to('jax')

+        vllm_fused_moe.quant_method.use_kernel = True
         vllm_fused_moe.quant_method.process_weights_after_loading(
             vllm_fused_moe)
-        vllm_fused_moe.quant_method.
+        vllm_fused_moe.quant_method.block_size = {
             "bt": 32,
             "bf": 512,
             "bd1": 1024,
@@ -309,7 +301,7 @@ def test_mxfp4_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
             "bfc": 512,
             "bd1c": 1024,
             "bd2c": 1024,
-        }
+        }

         actual = vllm_fused_moe(jax_a, score)

{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/layers/vllm/test_unquantized.py
RENAMED
@@ -13,7 +13,6 @@
 # limitations under the License.

 import tempfile
-from unittest import mock

 import jax
 import pytest
@@ -36,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                 RowParallelLinear)
 from vllm.model_executor.model_loader import get_model as vllm_get_model

-from tpu_inference.layers.vllm.fused_moe import FusedMoEBackend
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
 from tpu_inference.layers.vllm.quantization.unquantized import (
     VllmUnquantizedConfig, VllmUnquantizedFusedMoEMethod,
@@ -141,6 +139,9 @@ def test_row_parallel_linear(model, bias, num_devices, enable_sp,
     vllm_config = engine_args.create_engine_config()
     vllm_config.compilation_config.pass_config.enable_sp = enable_sp

+    input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+    input_tensor = input_tensor.to('cpu')
+
     with set_current_vllm_config(vllm_config):
         row_linear = RowParallelLinear(
             input_size=4096,
@@ -150,9 +151,6 @@ def test_row_parallel_linear(model, bias, num_devices, enable_sp,
             return_bias=False,
         )

-        input_tensor = torch.rand(10, row_linear.input_size, dtype=dtype) / 10
-        input_tensor = input_tensor.to('cpu')
-
         weight_data = torch.rand_like(row_linear.weight.data) / 10
         if bias:
             bias_data = torch.rand_like(row_linear.bias.data)
@@ -218,6 +216,9 @@ def test_column_parallel_linear(model, bias, num_devices, enable_sp,
     vllm_config = engine_args.create_engine_config()
     vllm_config.compilation_config.pass_config.enable_sp = enable_sp

+    input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+    input_tensor = input_tensor.to('cpu')
+
     with set_current_vllm_config(vllm_config):
         column_linear = ColumnParallelLinear(
             input_size=4096,
@@ -227,9 +228,6 @@ def test_column_parallel_linear(model, bias, num_devices, enable_sp,
             return_bias=False,
         )

-        input_tensor = torch.rand(10, column_linear.input_size, dtype=dtype) / 10
-        input_tensor = input_tensor.to('cpu')
-
         weight_data = torch.rand_like(column_linear.weight.data) / 10
         if bias:
             bias_data = torch.rand_like(column_linear.bias.data)
@@ -295,6 +293,9 @@ def test_qkv_parallel_linear(model, bias, num_devices, enable_sp, fuse_matmuls,
     vllm_config = engine_args.create_engine_config()
     vllm_config.compilation_config.pass_config.enable_sp = enable_sp

+    input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+    input_tensor = input_tensor.to('cpu')
+
     with set_current_vllm_config(vllm_config):
         qkv_linear = QKVParallelLinear(
             hidden_size=4096,
@@ -306,9 +307,6 @@ def test_qkv_parallel_linear(model, bias, num_devices, enable_sp, fuse_matmuls,
             return_bias=False,
         )

-        input_tensor = torch.rand(10, qkv_linear.input_size, dtype=dtype) / 10
-        input_tensor = input_tensor.to('cpu')
-
         weight_data = torch.rand_like(qkv_linear.weight.data) / 10
         if bias:
             bias_data = torch.rand_like(qkv_linear.bias.data)
@@ -377,6 +375,9 @@ def test_merged_column_parallel_linear(model, bias, num_devices, fuse_matmuls,
     vllm_config = engine_args.create_engine_config()
     vllm_config.compilation_config.pass_config.enable_sp = enable_sp

+    input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+    input_tensor = input_tensor.to('cpu')
+
     # Call vLLM code
     with set_current_vllm_config(vllm_config):
         merged_column_linear = MergedColumnParallelLinear(
@@ -387,10 +388,6 @@ def test_merged_column_parallel_linear(model, bias, num_devices, fuse_matmuls,
             return_bias=False,
         )

-        input_tensor = torch.rand(10, merged_column_linear.input_size,
-                                  dtype=dtype) / 10
-        input_tensor = input_tensor.to('cpu')
-
         weight_data = torch.rand_like(merged_column_linear.weight.data) / 10
         if bias:
             bias_data = torch.rand_like(merged_column_linear.bias.data)
@@ -478,8 +475,6 @@ def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
     )
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = dtype
-    vllm_config.parallel_config = ParallelConfig(
-        tensor_parallel_size=mesh.devices.size, enable_expert_parallel=use_ep)

     quant_config = get_tpu_quantization_config(vllm_config, mesh)
     with set_current_vllm_config(vllm_config):
@@ -511,10 +506,6 @@ def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
     with torchax.default_env(), set_forward_context(None, vllm_config):
         assert isinstance(vllm_fused_moe.quant_method,
                           VllmUnquantizedFusedMoEMethod)
-        if use_ep:
-            assert vllm_fused_moe.quant_method.moe_backend == FusedMoEBackend.GMM_EP
-        else:
-            assert vllm_fused_moe.quant_method.moe_backend == FusedMoEBackend.GMM_TP

         jax_a = a.to('jax')
         score = score.to('jax')
@@ -538,7 +529,6 @@ def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
 @pytest.mark.parametrize("topk", [8])
 @pytest.mark.parametrize("has_bias", [False, True])
 @pytest.mark.parametrize("enable_attn_dp", [False, True])
-@mock.patch("os.environ", {"USE_MOE_EP_KERNEL": "1"})
 def test_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
                               hidden_size, num_experts, topk, has_bias,
                               enable_attn_dp):
@@ -602,7 +592,7 @@ def test_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
     vllm_config = engine_args.create_engine_config()
     vllm_config.model_config.dtype = dtype
     vllm_config.parallel_config = ParallelConfig(
-        tensor_parallel_size=mesh.devices.size
+        tensor_parallel_size=mesh.devices.size)

     quant_config = get_tpu_quantization_config(vllm_config, mesh)
     with set_current_vllm_config(vllm_config):
@@ -619,6 +609,7 @@ def test_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
             has_bias=has_bias,
         )
         vllm_fused_moe.moe_parallel_config.use_ep = True
+        vllm_fused_moe.quant_method.use_kernel = True

         vllm_fused_moe.w13_weight.data = w1
         vllm_fused_moe.w2_weight.data = w2
@@ -634,14 +625,12 @@ def test_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
     with torchax.default_env(), set_forward_context(None, vllm_config):
         assert isinstance(vllm_fused_moe.quant_method,
                           VllmUnquantizedFusedMoEMethod)
-        assert vllm_fused_moe.quant_method.moe_backend == FusedMoEBackend.FUSED_MOE
-
         jax_a = a.to('jax')
         score = score.to('jax')

         vllm_fused_moe.quant_method.process_weights_after_loading(
             vllm_fused_moe)
-        vllm_fused_moe.quant_method.
+        vllm_fused_moe.quant_method.block_size = {
             "bt": 32,
             "bf": 512,
             "bd1": 512,
@@ -650,7 +639,7 @@ def test_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
             "bfc": 256,
             "bd1c": 256,
             "bd2c": 256,
-        }
+        }
         actual = vllm_fused_moe(jax_a, score)

         torch.testing.assert_close(
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tests/lora/test_layers.py
RENAMED
@@ -42,12 +42,10 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import current_platform

-from tpu_inference.layers.vllm.
-    _shard_module_to_tpu
-from tpu_inference.layers.vllm.quantization.configs import \
-    VllmQuantLinearConfig
+from tpu_inference.layers.vllm.quantization.common import JaxCommonLinearConfig
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedLinearMethod
+from tpu_inference.layers.vllm.sharding import _shard_module_to_tpu

 from .utils import DummyLoRAManager

@@ -631,7 +629,7 @@ def _create_lora_wrapper(linear,
                          mesh,
                          repeats=1):
     base_linear.weight.data = linear.weight.data
-    jax_config =
+    jax_config = JaxCommonLinearConfig(vllm_config, mesh, base_linear)
     linear_method = VllmUnquantizedLinearMethod(jax_config)
     base_linear.quant_method = linear_method
     linear_method.process_weights_after_loading(
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/executors/ray_distributed_executor.py
RENAMED
@@ -20,7 +20,7 @@ import ray
 import vllm.envs as envs
 from ray.util.placement_group import PlacementGroup
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-from vllm.multimodal.inputs import
+from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
@@ -53,7 +53,7 @@ logger = init_logger(__name__)


 def _encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types and
+    """Custom msgspec enc hook that supports array types and MultiModalKwargs.

     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """
@@ -62,7 +62,7 @@ def _encode_hook(obj: Any) -> Any:
             f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
             f"Given array has a type code of {obj.typecode}.")
         return obj.tobytes()
-    if isinstance(obj,
+    if isinstance(obj, MultiModalKwargs):
         return dict(obj)

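For background, _encode_hook follows msgspec's enc_hook extension pattern: the encoder calls the hook for any object it cannot serialize natively. A minimal, self-contained sketch of that pattern (array case only, simplified; the MultiModalKwargs handling and vLLM's type-code check are omitted):

import array
from typing import Any

import msgspec


def encode_hook_sketch(obj: Any) -> Any:
    # Serialize token-id arrays as raw bytes, mirroring _encode_hook above.
    if isinstance(obj, array.array):
        return obj.tobytes()
    raise TypeError(f"Unsupported type for msgpack encoding: {type(obj)}")


encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook_sketch)
payload = encoder.encode({"token_ids": array.array("l", [1, 2, 3])})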
{tpu_inference-0.13.2.dev20260104 → tpu_inference-0.13.2rc1}/tpu_inference/layers/common/quantization.py
RENAMED
@@ -52,7 +52,7 @@ def quantize_tensor_to_mxfp4_packed(


 def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:
-    """Unpack e2m1 tensor
+    """Unpack e2m1 tensor packed into u8."""
     assert u8_packed_e2m1.dtype == jnp.uint8
     e2m1 = jax.lax.bitcast_convert_type(u8_packed_e2m1, jnp.float4_e2m1fn)
     # bitcast creates one more dimension that splits 8 bits into two e2m1.
@@ -61,7 +61,7 @@ def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:


 def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
-    """Convert e8m0 (that was bitcasted to u8) into fp32
+    """Convert e8m0 (that was bitcasted to u8) into fp32"""
     assert u8.dtype == jnp.uint8

     e8_finfo = jnp.finfo(jnp.float8_e8m0fnu)
@@ -70,18 +70,6 @@ def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
     return jnp.ldexp(ones, exponents)


-def awq_u32_unpack_u4(awq_u32_packed: jax.Array) -> jax.Array:
-    """Unpack u4 tensor that was packed into u32 in awq ordering."""
-
-    awq_u4 = jax.lax.bitcast_convert_type(awq_u32_packed, jnp.uint4)
-
-    # AWQ packs 8 uint4 into 32-bits in this order: (0, 2, 4, 6, 1, 3, 5, 7).
-    # Following list maps the order used by AWQ into an ascending order.
-    reverse_awq_order = (0, 4, 1, 5, 2, 6, 3, 7)
-    u4 = awq_u4[..., reverse_awq_order]
-    return jnp.reshape(u4, u4.shape[:-2] + (-1, ))
-
-
 def dequantize_tensor(
     tensor_q: jax.Array,
     scale: jax.Array,
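As a rough illustration of the packing that u8_unpack_e2m1 handles (a sketch only, assuming a JAX build whose ml_dtypes exposes float4_e2m1fn, as the code above does): bitcasting uint8 to float4_e2m1fn adds a trailing axis of size two, which is then folded back so every packed byte yields two e2m1 values.

import jax
import jax.numpy as jnp

# Two rows of four packed bytes -> two rows of eight e2m1 values.
packed = jnp.arange(8, dtype=jnp.uint8).reshape(2, 4)

e2m1 = jax.lax.bitcast_convert_type(packed, jnp.float4_e2m1fn)
print(e2m1.shape)  # (2, 4, 2): the bitcast adds an axis holding the two nibbles

unpacked = jnp.reshape(e2m1, packed.shape[:-1] + (-1,))
print(unpacked.shape)  # (2, 8)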
tpu_inference-0.13.2.dev20260104/tpu_inference/layers/common/fused_moe_gmm.py → tpu_inference-0.13.2rc1/tpu_inference/layers/vllm/fused_moe.py
RENAMED
@@ -21,7 +21,7 @@ from jax.sharding import PartitionSpec as P

 from tpu_inference.kernels.megablox.gmm import gmm
 from tpu_inference.layers.common.sharding import ShardingAxisName
-from tpu_inference.layers.
+from tpu_inference.layers.vllm.linear_common import \
     slice_sharded_tensor_for_concatenation
 from tpu_inference.utils import get_mesh_shape_product
