tpu-inference 0.12.0.dev20251222__py3-none-any.whl → 0.12.0.dev20251224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/core/test_dp_scheduler.py +128 -71
- tests/e2e/test_data_parallel.py +176 -280
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_speculative_decoding.py +26 -6
- tests/layers/jax/test_qwix.py +1 -1
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +36 -21
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +36 -21
- tests/layers/vllm/test_mxfp4.py +25 -10
- tests/layers/vllm/test_unquantized.py +61 -31
- tests/layers/vllm/utils.py +19 -4
- tests/models/common/test_model_loader.py +2 -2
- tests/models/jax/test_qwen2_5_vl.py +10 -11
- tests/runner/test_multimodal_manager.py +3 -3
- tests/runner/test_tpu_runner.py +67 -8
- tests/runner/test_tpu_runner_dp.py +66 -0
- tpu_inference/core/sched/dp_scheduler.py +65 -40
- tpu_inference/kernels/mla/v1/kernel.py +7 -26
- tpu_inference/layers/common/sharding.py +8 -3
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +3 -3
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +3 -3
- tpu_inference/layers/jax/attention/llama4_attention.py +3 -4
- tpu_inference/layers/jax/sample/sampling.py +1 -1
- tpu_inference/layers/vllm/fused_moe.py +51 -47
- tpu_inference/layers/vllm/quantization/common.py +14 -13
- tpu_inference/layers/vllm/quantization/mxfp4.py +21 -7
- tpu_inference/layers/vllm/quantization/unquantized.py +19 -7
- tpu_inference/layers/vllm/sharding.py +7 -4
- tpu_inference/models/common/model_loader.py +11 -14
- tpu_inference/models/jax/llama3.py +13 -10
- tpu_inference/models/jax/llama_guard_4.py +1 -1
- tpu_inference/models/jax/qwen2.py +3 -2
- tpu_inference/models/jax/qwen2_5_vl.py +4 -4
- tpu_inference/models/jax/utils/multi_modal_utils.py +4 -4
- tpu_inference/models/jax/utils/qwix/qwix_utils.py +3 -3
- tpu_inference/models/vllm/vllm_model_wrapper.py +5 -2
- tpu_inference/platforms/tpu_platform.py +7 -7
- tpu_inference/runner/compilation_manager.py +43 -33
- tpu_inference/runner/kv_cache_manager.py +1 -2
- tpu_inference/runner/multimodal_manager.py +1 -1
- tpu_inference/runner/tpu_runner.py +12 -9
- tpu_inference/utils.py +31 -30
- tpu_inference/worker/tpu_worker.py +5 -2
- {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/METADATA +1 -1
- {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/RECORD +47 -46
- {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/WHEEL +0 -0
- {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/top_level.txt +0 -0
tests/e2e/test_hybrid_kvcache.py
ADDED
@@ -0,0 +1,219 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import time
+from dataclasses import asdict
+
+import pytest
+from vllm import LLM, EngineArgs, SamplingParams
+
+
+@pytest.fixture
+def model_name():
+    """Choose gemma-27b as the test model as it has both full attention and
+    sliding window attention."""
+    return "google/gemma-3-27b-it"
+
+
+@pytest.fixture
+def test_prompts():
+    """Simple test prompts for hybrid kv cache testing."""
+    return [
+        "Hello, my name is",
+        "The capital of France is",
+        "The colors of the rainbow are",
+        "The future of AI is",
+        "The president of the United States is",
+        "How many players are on a standard soccer team?",
+        "In Greek mythology, who is the god of the sea?",
+        "What is the capital of Australia?",
+        "What is the largest planet in our solar system?",
+        "Who developed the theory of general relativity?",
+    ]
+
+
+@pytest.fixture
+def sampling_params():
+    """Standard sampling parameters for testing."""
+    return SamplingParams(
+        temperature=0.0,
+        max_tokens=32,
+        ignore_eos=True,
+        logprobs=1,
+    )
+
+
+def _run_inference_with_config(
+        model_name: str,
+        test_prompts: list,
+        sampling_params: SamplingParams,
+        tensor_parallel_size: int = 4,
+        kv_cache_dtype: str = "auto",
+        enable_prefix_caching: bool = False,
+        disable_hybrid_kv_cache_manager: bool = False) -> list:
+    """Helper function to run inference with specified configuration."""
+
+    # Create LLM args using parser-based approach similar to offline_inference.py
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=64,
+        tensor_parallel_size=tensor_parallel_size,
+        gpu_memory_utilization=0.95,
+        max_num_batched_tokens=256,
+        max_num_seqs=16,
+        enable_prefix_caching=enable_prefix_caching,
+        kv_cache_dtype=kv_cache_dtype,
+        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
+    )
+
+    engine_args_dict = asdict(engine_args)
+    llm = LLM(**engine_args_dict)
+
+    try:
+        outputs = llm.generate(test_prompts, sampling_params)
+        return outputs
+    finally:
+        del llm
+        # Wait for TPUs to be released
+        time.sleep(10)
+
+
+def test_hybrid_kv_cache(
+    model_name: str,
+    test_prompts: list,
+    sampling_params: SamplingParams,
+):
+    """
+    Test hybrid kv cache works on gemma vLLM models.
+    """
+
+    os.environ['MODEL_IMPL_TYPE'] = 'vllm'
+    # Test with hybrid kv cache allocation enabled.
+    outputs = _run_inference_with_config(
+        model_name=model_name,
+        test_prompts=test_prompts,
+        sampling_params=sampling_params,
+        disable_hybrid_kv_cache_manager=False,
+    )
+
+    # Verify we got outputs for all prompts
+    assert len(outputs) == len(test_prompts)
+
+    # Verify each output has generated text
+    for output in outputs:
+        assert len(output.outputs) > 0
+        assert len(output.outputs[0].text.strip()) > 0
+
+    print(f"✓ Hybrid KV cache test passed with {len(outputs)} outputs")
+
+
+def test_hybrid_kv_cache_correctness(
+    model_name: str,
+    test_prompts: list,
+    sampling_params: SamplingParams,
+):
+    """
+    Test that hybrid kv cache allocation produces consistent results compared
+    to standard kv cache allocation.
+    """
+    os.environ['SKIP_JAX_PRECOMPILE'] = '1'
+    os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '0'
+
+    small_prompts = test_prompts
+
+    # Run baseline (no hybrid kv cache)
+    baseline_outputs = _run_inference_with_config(
+        model_name=model_name,
+        test_prompts=small_prompts,
+        sampling_params=sampling_params,
+        disable_hybrid_kv_cache_manager=True,
+    )
+
+    # Run with hybrid kv cache enabled.
+    hybrid_kvcache_outputs = _run_inference_with_config(
+        model_name=model_name,
+        test_prompts=small_prompts,
+        sampling_params=sampling_params,
+        disable_hybrid_kv_cache_manager=False,
+    )
+
+    # Compare outputs - in theory they should be identical for greedy sampling;
+    # in reality there may be some differences, but overall the outputs should
+    # be very similar.
+
+    # An example:
+    # prompt: What is the capital of Australia?
+    # both answers should be acceptable.
+    # The capital of Australia is Canberra. It is located in the Australian Capital Territory (ACT) and is home to many
+    # Canberra is the capital of Australia. It is located in the Australian Capital Territory (ACT) and is home to
+    assert len(baseline_outputs) == len(hybrid_kvcache_outputs)
+
+    text_matches = 0
+    text_mismatches = 0
+    logprob_mismatches = 0
+    max_logprob_diff = 0.0
+
+    for i, (baseline, hybrid_kvcache_result) in enumerate(
+            zip(baseline_outputs, hybrid_kvcache_outputs)):
+        baseline_text = baseline.outputs[0].text.strip()
+        hybrid_kvcache_text = hybrid_kvcache_result.outputs[0].text.strip()
+
+        # Check text output
+        if baseline_text == hybrid_kvcache_text:
+            text_matches += 1
+        else:
+            text_mismatches += 1
+            print(f"Text mismatch found in prompt {i}:")
+            print(f"  Baseline: {baseline_text}")
+            print(f"  Hybrid KV Cache: {hybrid_kvcache_text}")
+
+        # Check log probabilities
+        baseline_logprobs = baseline.outputs[0].logprobs
+        hybrid_kvcache_logprobs = hybrid_kvcache_result.outputs[0].logprobs
+        if baseline_logprobs is not None and hybrid_kvcache_logprobs is not None:
+            # Compare log probabilities for each token
+            assert len(baseline_logprobs) == len(hybrid_kvcache_logprobs), \
+                f"Logprobs length mismatch: {len(baseline_logprobs)} vs {len(hybrid_kvcache_logprobs)}"
+            for token_idx, (base_lp, hybrid_kvcache_lp) in enumerate(
+                    zip(baseline_logprobs, hybrid_kvcache_logprobs)):
+                # Get the top logprob value for the selected token
+                if base_lp and hybrid_kvcache_lp:
+                    # Get the top token's logprob from each
+                    base_top_token = list(base_lp.keys())[0]
+                    hybrid_kvcache_top_token = list(
+                        hybrid_kvcache_lp.keys())[0]
+
+                    base_logprob_val = base_lp[base_top_token].logprob
+                    hybrid_kvcache_logprob_val = hybrid_kvcache_lp[
+                        hybrid_kvcache_top_token].logprob
+
+                    # Calculate absolute difference
+                    diff = abs(base_logprob_val - hybrid_kvcache_logprob_val)
+                    max_logprob_diff = max(max_logprob_diff, diff)
+
+                    # Allow small numerical differences (e.g., 1e-3)
+                    if diff > 1e-3:
+                        logprob_mismatches += 1
+                        print(
+                            f"Logprob mismatch in prompt {i}, token {token_idx}:"
+                        )
+                        print(
+                            f"  Baseline token: {base_top_token}, logprob: {base_logprob_val:.6f}"
+                        )
+                        print(
+                            f"  Hybrid KV Cache token: {hybrid_kvcache_top_token}, logprob: {hybrid_kvcache_logprob_val:.6f}"
+                        )
+                        print(f"  Difference: {diff:.6f}")
+
+    print("✓ Correctness test results:")
+    print(f"  Text: {text_matches} matches, {text_mismatches} mismatches")
+    print(f"  Max logprob difference: {max_logprob_diff:.6e}")
+    print(f"  Significant logprob mismatches (>1e-3): {logprob_mismatches}")
+
+    # Allow for some variance due to potential numerical differences,
+    # but most outputs should match with greedy sampling
+    text_match_rate = text_matches / len(baseline_outputs)
+    assert text_match_rate >= 0.9, f"Text match rate {text_match_rate:.2%} is too low"
+
+    # Log probabilities should be very close (allow small numerical errors)
+    assert max_logprob_diff < 2, f"Max logprob difference {max_logprob_diff} is too large"
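A minimal driver sketch for the new test module (not part of the wheel; it assumes only pytest and the test path above, and the tests set their own MODEL_IMPL_TYPE / SKIP_JAX_PRECOMPILE / VLLM_XLA_CHECK_RECOMPILATION environment variables):

# Hypothetical driver, shown for illustration only.
import sys

import pytest

# Equivalent to: pytest -s tests/e2e/test_hybrid_kvcache.py
sys.exit(pytest.main(["-s", "tests/e2e/test_hybrid_kvcache.py"]))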
tests/e2e/test_speculative_decoding.py
CHANGED
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import os
 import random
 import string
 import time
@@ -22,6 +23,19 @@ import pytest
 from vllm import LLM, SamplingParams
 
 
+# TODO (Qiliang Cui): remove this when XLA fixes the recursive jit call issue.
+def _is_v7x():
+    # jax.devices() will hang so use IS_FOR_V7X to indicate the version.
+    return os.environ.get("IS_FOR_V7X", "false") == "true"
+
+
+def _get_tensor_parallel_size():
+    # Work around an XLA issue.
+    if _is_v7x():
+        return 2
+    return 1
+
+
 def get_ngram_test_prompts():
     num_prompts = 100
     prompts = []
@@ -87,7 +101,10 @@ def _test_correctness_helper(
     with monkeypatch.context():
         test_prompts = get_test_prompts(speculative_config)
 
-        ref_llm = LLM(model=model_name)
+        ref_llm = LLM(model=model_name,
+                      max_model_len=1024,
+                      max_num_seqs=4,
+                      tensor_parallel_size=_get_tensor_parallel_size())
         ref_outputs = ref_llm.generate(test_prompts, sampling_config)
 
         del ref_llm
@@ -98,7 +115,8 @@ def _test_correctness_helper(
         spec_llm = LLM(model=model_name,
                        speculative_config=speculative_config,
                        max_model_len=1024,
-                       max_num_seqs=4)
+                       max_num_seqs=4,
+                       tensor_parallel_size=_get_tensor_parallel_size())
         spec_outputs = spec_llm.generate(test_prompts, sampling_config)
 
         matches = 0
@@ -179,7 +197,8 @@ def _test_performance_helper(
     ref_llm = LLM(model=model_name,
                   max_model_len=1024,
                   max_num_seqs=1,
-                  enable_prefix_caching=False)
+                  enable_prefix_caching=False,
+                  tensor_parallel_size=_get_tensor_parallel_size())
 
     start_time = time.time()
     _ = ref_llm.generate(test_prompts, sampling_config)
@@ -195,6 +214,7 @@ def _test_performance_helper(
                    speculative_config=speculative_config,
                    max_model_len=1024,
                    max_num_seqs=1,
+                   tensor_parallel_size=_get_tensor_parallel_size(),
                    enable_prefix_caching=False)
 
     start_time = time.time()
@@ -229,7 +249,7 @@ def test_ngram_performance_greedy(
         "prompt_lookup_max": 2,
         "prompt_lookup_min": 2,
         "num_speculative_tokens": 4,
-    }, 3.0)
+    }, 1.2 if _is_v7x() else 3.0)
 
 
 def test_ngram_performance_random(
@@ -251,7 +271,7 @@ def test_ngram_performance_random(
         "prompt_lookup_max": 2,
         "prompt_lookup_min": 2,
         "num_speculative_tokens": 4,
-    }, 3.0)
+    }, 1.5 if _is_v7x() else 3.0)
 
 
 def test_eagle3_correctness(
@@ -288,4 +308,4 @@ def test_eagle3_performance(
         "model": "unkmaster/EAGLE3-LLaMA3.1-Instruct-8B",
         "num_speculative_tokens": 2,
         "draft_tensor_parallel_size": 1
-    }, 1.8)
+    }, 1.2 if _is_v7x() else 1.8)
tests/layers/jax/test_qwix.py
CHANGED
@@ -832,7 +832,7 @@ class TestGetDefaultQwixQuantizationConfig(unittest.TestCase):
         # Patch the constants in the module where the function resides
         self.patchers = [
             patch(
-                "tpu_inference.models.jax.utils.qwix.qwix_utils.
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG",
                 self.mock_deepseek_config),
             patch(
                 "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_LLAMA4_FP8_CONFIG",
@@ -251,12 +251,16 @@ def test_loading_model(model, mesh):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("enable_sp", [False, True])
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_row_parallel_linear(model, bias, num_devices, enable_sp,
+                             enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     dtype = torch.bfloat16
 
     engine_args = EngineArgs(
@@ -287,12 +291,16 @@ def test_row_parallel_linear(model, bias, mesh, enable_sp):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("enable_sp", [False, True])
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_column_parallel_linear(model, bias, num_devices, enable_sp,
+                                enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     dtype = torch.bfloat16
 
     engine_args = EngineArgs(
@@ -324,13 +332,17 @@ def test_column_parallel_linear(model, bias, mesh, enable_sp):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("enable_sp", [False, True])
 @pytest.mark.parametrize("fuse_matmuls", [False, True])
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_qkv_parallel_linear(model, bias, num_devices, enable_sp, fuse_matmuls,
+                             enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     dtype = torch.bfloat16
 
     engine_args = EngineArgs(
@@ -365,14 +377,17 @@ def test_qkv_parallel_linear(model, bias, mesh, enable_sp, fuse_matmuls):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("fuse_matmuls", [False, True])
 @pytest.mark.parametrize("enable_sp", [False, True])
-
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_merged_column_parallel_linear(model, bias, num_devices, fuse_matmuls,
+                                       enable_sp, enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     dtype = torch.bfloat16
 
     engine_args = EngineArgs(
@@ -138,12 +138,16 @@ def test_loading_model(model, mesh):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("enable_sp", [False, True])
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_row_parallel_linear(model, bias, num_devices, enable_sp,
+                             enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
 
     dtype = torch.bfloat16
 
@@ -209,12 +213,16 @@ def test_row_parallel_linear(model, bias, mesh, enable_sp):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("enable_sp", [False, True])
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_column_parallel_linear(model, bias, num_devices, enable_sp,
+                                enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     dtype = torch.bfloat16
 
     engine_args = EngineArgs(
@@ -280,13 +288,17 @@ def test_column_parallel_linear(model, bias, mesh, enable_sp):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("enable_sp", [False, True])
 @pytest.mark.parametrize("fuse_matmuls", [False, True])
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_qkv_parallel_linear(model, bias, num_devices, enable_sp, fuse_matmuls,
+                             enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     dtype = torch.bfloat16
 
     engine_args = EngineArgs(
@@ -354,14 +366,17 @@ def test_qkv_parallel_linear(model, bias, mesh, enable_sp, fuse_matmuls):
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("bias", [False, True])
-@pytest.mark.parametrize("mesh", [
-    test_utils.get_spmd_mesh(1),
-    test_utils.get_spmd_mesh(jax.local_device_count())
-])
+@pytest.mark.parametrize("num_devices", [1, jax.local_device_count()])
 @pytest.mark.parametrize("fuse_matmuls", [False, True])
 @pytest.mark.parametrize("enable_sp", [False, True])
-
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_merged_column_parallel_linear(model, bias, num_devices, fuse_matmuls,
+                                       enable_sp, enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
    dtype = torch.bfloat16
 
    engine_args = EngineArgs(
tests/layers/vllm/test_mxfp4.py
CHANGED
@@ -119,17 +119,22 @@ def test_quant_override(model, mesh):
     assert quant_config.mesh == mesh
 
 
-@pytest.mark.parametrize(
-    "mesh", [test_utils.get_spmd_mesh(1),
-             test_utils.get_spmd_mesh(2)])
+@pytest.mark.parametrize("num_devices", [1, 2])
 @pytest.mark.parametrize("num_tokens", [8])
 @pytest.mark.parametrize("intermediate_size", [1024])
 @pytest.mark.parametrize("hidden_size", [128])
 @pytest.mark.parametrize("num_experts", [8])
 @pytest.mark.parametrize("topk", [2])
 @pytest.mark.parametrize("use_ep", [True, False])
-
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_mxfp4_fused_moe(num_devices, num_tokens, intermediate_size,
+                         hidden_size, num_experts, topk, use_ep,
+                         enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
     torch.manual_seed(42)
     dtype = torch.bfloat16
 
@@ -201,16 +206,26 @@ def test_mxfp4_fused_moe(mesh, num_tokens, intermediate_size, hidden_size,
                            rtol=1e-1)
 
 
-@pytest.mark.parametrize(
-    "mesh", [test_utils.get_spmd_mesh(1),
-             test_utils.get_spmd_mesh(2)])
+@pytest.mark.parametrize("num_devices", [1, 2])
 @pytest.mark.parametrize("num_tokens", [8])
 @pytest.mark.parametrize("intermediate_size", [512])
 @pytest.mark.parametrize("hidden_size", [1024])
 @pytest.mark.parametrize("num_experts", [8])
 @pytest.mark.parametrize("topk", [2])
-
-
+@pytest.mark.parametrize("enable_attn_dp", [False, True])
+def test_mxfp4_fused_moe_use_kernel(num_devices, num_tokens, intermediate_size,
+                                    hidden_size, num_experts, topk,
+                                    enable_attn_dp):
+    # Skip if enable_attn_dp is True but we don't have enough devices
+    if enable_attn_dp and num_devices < 2:
+        pytest.skip("enable_attn_dp requires at least 2 devices")
+
+    # Skip attn_dp tests for fused_moe_use_kernel since the kernel only supports 2D mesh
+    if enable_attn_dp:
+        pytest.skip(
+            "fused_moe kernel does not support attn_dp (requires 2D mesh)")
+
+    mesh = test_utils.get_spmd_mesh(num_devices, enable_attn_dp)
 
     torch.manual_seed(42)
     dtype = torch.bfloat16