tpu-inference 0.12.0.dev20251213__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (248)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_disagg_utils.py +14 -0
  4. tests/core/test_dp_scheduler.py +650 -768
  5. tests/core/test_init.py +14 -0
  6. tests/distributed/__init__.py +13 -0
  7. tests/distributed/test_distributed_utils.py +120 -0
  8. tests/distributed/test_tpu_connector.py +478 -0
  9. tests/e2e/__init__.py +13 -0
  10. tests/e2e/test_async_scheduler.py +211 -0
  11. tests/e2e/test_data_parallel.py +289 -0
  12. tests/e2e/test_hybrid_kvcache.py +219 -0
  13. tests/e2e/test_local_disagg.py +257 -0
  14. tests/e2e/test_model_loader.py +268 -0
  15. tests/e2e/test_multi_modal_inference.py +111 -0
  16. tests/e2e/test_pipeline_parallel.py +265 -0
  17. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  18. tests/e2e/test_sampling_params.py +269 -0
  19. tests/e2e/test_speculative_decoding.py +311 -0
  20. tests/e2e/test_structured_decoding.py +46 -0
  21. tests/executors/__init__.py +13 -0
  22. tests/executors/test_ray_distributed_executor.py +199 -0
  23. tests/experimental/__init__.py +13 -0
  24. tests/experimental/test_llama3_jax_stashed.py +208 -0
  25. tests/kernels/__init__.py +13 -0
  26. tests/kernels/collectives/__init__.py +13 -0
  27. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  28. tests/kernels/fused_moe_v1_test.py +14 -0
  29. tests/kernels/gmm_test.py +205 -0
  30. tests/kernels/mla_v1_test.py +14 -0
  31. tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
  32. tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
  33. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +14 -0
  34. tests/kernels/ragged_paged_attention_kernel_v3_test.py +14 -0
  35. tests/layers/__init__.py +13 -0
  36. tests/layers/common/__init__.py +13 -0
  37. tests/layers/common/test_attention_interface.py +156 -0
  38. tests/layers/common/test_quantization.py +149 -0
  39. tests/layers/jax/__init__.py +13 -0
  40. tests/layers/jax/attention/__init__.py +13 -0
  41. tests/layers/jax/attention/test_common_attention.py +103 -0
  42. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  43. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  44. tests/layers/jax/moe/__init__.py +13 -0
  45. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  46. tests/layers/jax/sample/__init__.py +13 -0
  47. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  48. tests/layers/jax/sample/test_sampling.py +115 -0
  49. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  50. tests/layers/jax/test_layers.py +155 -0
  51. tests/{test_quantization.py → layers/jax/test_qwix.py} +180 -50
  52. tests/layers/jax/test_rope.py +93 -0
  53. tests/layers/jax/test_sharding.py +159 -0
  54. tests/layers/jax/test_transformer_block.py +152 -0
  55. tests/layers/vllm/__init__.py +13 -0
  56. tests/layers/vllm/test_attention.py +363 -0
  57. tests/layers/vllm/test_awq.py +406 -0
  58. tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
  59. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
  60. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
  61. tests/layers/vllm/test_fp8.py +17 -0
  62. tests/layers/vllm/test_mxfp4.py +320 -0
  63. tests/layers/vllm/test_unquantized.py +662 -0
  64. tests/layers/vllm/utils.py +87 -0
  65. tests/lora/__init__.py +13 -0
  66. tests/lora/conftest.py +14 -0
  67. tests/lora/test_bgmv.py +14 -0
  68. tests/lora/test_layers.py +25 -8
  69. tests/lora/test_lora.py +15 -1
  70. tests/lora/test_lora_perf.py +14 -0
  71. tests/models/__init__.py +13 -0
  72. tests/models/common/__init__.py +13 -0
  73. tests/models/common/test_model_loader.py +455 -0
  74. tests/models/jax/__init__.py +13 -0
  75. tests/models/jax/test_deepseek_v3.py +401 -0
  76. tests/models/jax/test_llama3.py +184 -0
  77. tests/models/jax/test_llama4.py +298 -0
  78. tests/models/jax/test_llama_eagle3.py +197 -0
  79. tests/models/jax/test_llama_guard_4.py +242 -0
  80. tests/models/jax/test_qwen2.py +172 -0
  81. tests/models/jax/test_qwen2_5_vl.py +605 -0
  82. tests/models/jax/test_qwen3.py +169 -0
  83. tests/models/jax/test_weight_loading.py +180 -0
  84. tests/models/jax/utils/__init__.py +13 -0
  85. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  86. tests/platforms/__init__.py +13 -0
  87. tests/platforms/test_tpu_platform.py +54 -0
  88. tests/runner/__init__.py +13 -0
  89. tests/runner/test_block_table.py +395 -0
  90. tests/runner/test_input_batch.py +226 -0
  91. tests/runner/test_kv_cache.py +220 -0
  92. tests/runner/test_kv_cache_manager.py +498 -0
  93. tests/runner/test_multimodal_manager.py +429 -0
  94. tests/runner/test_persistent_batch_manager.py +84 -0
  95. tests/runner/test_speculative_decoding_manager.py +368 -0
  96. tests/runner/test_structured_decoding_manager.py +220 -0
  97. tests/runner/test_tpu_runner.py +261 -0
  98. tests/runner/test_tpu_runner_dp.py +1099 -0
  99. tests/runner/test_tpu_runner_mesh.py +200 -0
  100. tests/runner/test_utils.py +411 -0
  101. tests/spec_decode/__init__.py +13 -0
  102. tests/spec_decode/test_eagle3.py +311 -0
  103. tests/test_base.py +14 -0
  104. tests/test_tpu_info.py +14 -0
  105. tests/test_utils.py +1 -43
  106. tests/worker/__init__.py +13 -0
  107. tests/worker/tpu_worker_test.py +414 -0
  108. tpu_inference/__init__.py +14 -0
  109. tpu_inference/core/__init__.py +13 -0
  110. tpu_inference/core/sched/__init__.py +13 -0
  111. tpu_inference/core/sched/dp_scheduler.py +372 -56
  112. tpu_inference/distributed/__init__.py +13 -0
  113. tpu_inference/distributed/jax_parallel_state.py +14 -0
  114. tpu_inference/distributed/tpu_connector.py +14 -9
  115. tpu_inference/distributed/utils.py +56 -4
  116. tpu_inference/executors/__init__.py +13 -0
  117. tpu_inference/executors/ray_distributed_executor.py +20 -3
  118. tpu_inference/experimental/__init__.py +13 -0
  119. tpu_inference/experimental/llama3_jax_stashed.py +14 -0
  120. tpu_inference/kernels/__init__.py +13 -0
  121. tpu_inference/kernels/collectives/__init__.py +13 -0
  122. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  123. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  124. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  125. tpu_inference/kernels/fused_moe/v1/kernel.py +171 -163
  126. tpu_inference/kernels/megablox/__init__.py +13 -0
  127. tpu_inference/kernels/megablox/common.py +54 -0
  128. tpu_inference/kernels/megablox/gmm.py +646 -0
  129. tpu_inference/kernels/mla/__init__.py +13 -0
  130. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  131. tpu_inference/kernels/mla/v1/kernel.py +20 -26
  132. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  133. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  134. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  135. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  136. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +112 -69
  137. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +85 -65
  138. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
  139. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +374 -194
  140. tpu_inference/kernels/ragged_paged_attention/v3/util.py +13 -0
  141. tpu_inference/layers/__init__.py +13 -0
  142. tpu_inference/layers/common/__init__.py +13 -0
  143. tpu_inference/layers/common/attention_interface.py +26 -19
  144. tpu_inference/layers/common/attention_metadata.py +14 -0
  145. tpu_inference/layers/common/fused_moe_gmm.py +506 -0
  146. tpu_inference/layers/common/quant_methods.py +15 -0
  147. tpu_inference/layers/common/quantization.py +282 -0
  148. tpu_inference/layers/common/sharding.py +22 -3
  149. tpu_inference/layers/common/utils.py +94 -0
  150. tpu_inference/layers/jax/__init__.py +13 -0
  151. tpu_inference/layers/jax/attention/__init__.py +13 -0
  152. tpu_inference/layers/jax/attention/attention.py +19 -6
  153. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +52 -27
  154. tpu_inference/layers/jax/attention/gpt_oss_attention.py +19 -6
  155. tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
  156. tpu_inference/layers/jax/base.py +14 -0
  157. tpu_inference/layers/jax/constants.py +13 -0
  158. tpu_inference/layers/jax/layers.py +14 -0
  159. tpu_inference/layers/jax/misc.py +14 -0
  160. tpu_inference/layers/jax/moe/__init__.py +13 -0
  161. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
  162. tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
  163. tpu_inference/layers/jax/moe/moe.py +43 -3
  164. tpu_inference/layers/jax/pp_utils.py +53 -0
  165. tpu_inference/layers/jax/rope.py +14 -0
  166. tpu_inference/layers/jax/rope_interface.py +14 -0
  167. tpu_inference/layers/jax/sample/__init__.py +13 -0
  168. tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
  169. tpu_inference/layers/jax/sample/sampling.py +15 -1
  170. tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
  171. tpu_inference/layers/jax/transformer_block.py +14 -0
  172. tpu_inference/layers/vllm/__init__.py +13 -0
  173. tpu_inference/layers/vllm/attention.py +4 -4
  174. tpu_inference/layers/vllm/fused_moe.py +100 -455
  175. tpu_inference/layers/vllm/linear.py +64 -0
  176. tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
  177. tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
  178. tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
  179. tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
  180. tpu_inference/layers/vllm/quantization/__init__.py +19 -3
  181. tpu_inference/layers/vllm/quantization/awq.py +96 -82
  182. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  183. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +19 -5
  184. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +119 -132
  185. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  186. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
  187. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
  188. tpu_inference/layers/vllm/quantization/{common.py → configs.py} +38 -26
  189. tpu_inference/layers/vllm/quantization/fp8.py +119 -0
  190. tpu_inference/layers/vllm/quantization/mxfp4.py +133 -220
  191. tpu_inference/layers/vllm/quantization/unquantized.py +154 -253
  192. tpu_inference/lora/__init__.py +13 -0
  193. tpu_inference/lora/torch_lora_ops.py +8 -13
  194. tpu_inference/models/__init__.py +13 -0
  195. tpu_inference/models/common/__init__.py +13 -0
  196. tpu_inference/models/common/model_loader.py +37 -16
  197. tpu_inference/models/jax/__init__.py +13 -0
  198. tpu_inference/models/jax/deepseek_v3.py +113 -124
  199. tpu_inference/models/jax/gpt_oss.py +23 -7
  200. tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
  201. tpu_inference/models/jax/llama3.py +99 -36
  202. tpu_inference/models/jax/llama4.py +14 -0
  203. tpu_inference/models/jax/llama_eagle3.py +14 -0
  204. tpu_inference/models/jax/llama_guard_4.py +15 -1
  205. tpu_inference/models/jax/qwen2.py +17 -2
  206. tpu_inference/models/jax/qwen2_5_vl.py +18 -4
  207. tpu_inference/models/jax/qwen3.py +17 -2
  208. tpu_inference/models/jax/utils/__init__.py +13 -0
  209. tpu_inference/models/jax/utils/file_utils.py +14 -0
  210. tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
  211. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  212. tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +85 -24
  213. tpu_inference/models/jax/utils/weight_utils.py +32 -1
  214. tpu_inference/models/vllm/__init__.py +13 -0
  215. tpu_inference/models/vllm/vllm_model_wrapper.py +22 -4
  216. tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
  217. tpu_inference/platforms/__init__.py +14 -0
  218. tpu_inference/platforms/tpu_platform.py +27 -29
  219. tpu_inference/runner/__init__.py +13 -0
  220. tpu_inference/runner/compilation_manager.py +69 -35
  221. tpu_inference/runner/kv_cache.py +14 -0
  222. tpu_inference/runner/kv_cache_manager.py +15 -2
  223. tpu_inference/runner/lora_utils.py +16 -1
  224. tpu_inference/runner/multimodal_manager.py +16 -2
  225. tpu_inference/runner/persistent_batch_manager.py +14 -0
  226. tpu_inference/runner/speculative_decoding_manager.py +14 -0
  227. tpu_inference/runner/structured_decoding_manager.py +14 -0
  228. tpu_inference/runner/tpu_runner.py +30 -10
  229. tpu_inference/spec_decode/__init__.py +13 -0
  230. tpu_inference/spec_decode/jax/__init__.py +13 -0
  231. tpu_inference/spec_decode/jax/eagle3.py +13 -0
  232. tpu_inference/tpu_info.py +14 -0
  233. tpu_inference/utils.py +31 -30
  234. tpu_inference/worker/__init__.py +13 -0
  235. tpu_inference/worker/tpu_worker.py +23 -7
  236. {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +1 -1
  237. tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
  238. tpu_inference/layers/vllm/linear_common.py +0 -208
  239. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  240. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
  241. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
  242. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
  243. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
  244. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
  245. tpu_inference-0.12.0.dev20251213.dist-info/RECORD +0 -175
  246. {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
  247. {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
  248. {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,268 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # tests/e2e/test_model_loader.py
+
+ import os
+ import re
+ import signal
+ import subprocess
+ import sys
+ import tempfile
+ import time
+
+ import pytest
+ import requests
+ import torch
+ from flax import nnx
+ from vllm.model_executor.models.registry import ModelRegistry
+
+ from tpu_inference.models.common.model_loader import (_MODEL_REGISTRY,
+                                                        register_model)
+
+
+ @pytest.fixture
+ def cleanup_registries():
+     """Cleans up the model registries before and after each test."""
+     _MODEL_REGISTRY.clear()
+     # vLLM's ModelRegistry uses a class-level dictionary to store model classes.
+     # We need to clear it to ensure test isolation.
+     if hasattr(ModelRegistry, "models"):
+         ModelRegistry.models.clear()
+     yield
+     _MODEL_REGISTRY.clear()
+     if hasattr(ModelRegistry, "models"):
+         ModelRegistry.models.clear()
+
+
+ class DummyGoodModel(nnx.Module):
+     """A valid model that conforms to the expected interface."""
+
+     def __init__(self, vllm_config=None, rng=None, mesh=None):
+         pass
+
+     def __call__(self,
+                  kv_caches=None,
+                  input_ids=None,
+                  attention_metadata=None):
+         pass
+
+
+ def test_register_model_success(cleanup_registries):
+     """Tests that a valid model is registered successfully."""
+     arch = "DummyGoodModelForCausalLM"
+     register_model(arch, DummyGoodModel)
+
+     # Check tpu_inference registry
+     assert arch in _MODEL_REGISTRY
+
+     class MockModelConfig:
+
+         def __init__(self, architectures):
+             self.hf_config = self._MockHfConfig(architectures)
+             self.model_impl = "flax_nnx"
+
+         class _MockHfConfig:
+
+             def __init__(self, architectures):
+                 self.architectures = architectures
+
+     model_config = MockModelConfig(architectures=[arch])
+     vllm_compatible_model, _ = ModelRegistry.resolve_model_cls(
+         architectures=[arch], model_config=model_config)
+     assert vllm_compatible_model is not None
+     assert issubclass(vllm_compatible_model, torch.nn.Module)
+     assert issubclass(vllm_compatible_model, DummyGoodModel)
+
+
+ try:
+     # Attempt to import vLLM's interface validation function
+     from vllm.model_executor.models.interfaces_base import is_vllm_model
+     VLLM_INTERFACE_CHECK_AVAILABLE = True
+ except ImportError:
+     VLLM_INTERFACE_CHECK_AVAILABLE = False
+
+
+ @pytest.mark.skipif(not VLLM_INTERFACE_CHECK_AVAILABLE,
+                     reason="is_vllm_model could not be imported from vllm.")
+ def test_registered_model_passes_vllm_interface_check(cleanup_registries):
+     """
+     Ensures the wrapped model passes vLLM's own interface validation.
+
+     This test is future-proof. If vLLM adds new requirements to its
+     model interface, this test will fail, signaling that the wrapper
+     in `register_model` needs to be updated.
+     """
+     arch = "DummyGoodModelForCausalLM"
+     register_model(arch, DummyGoodModel)
+
+     class MockModelConfig:
+
+         def __init__(self, architectures):
+             self.hf_config = self._MockHfConfig(architectures)
+             self.model_impl = "flax_nnx"
+
+         class _MockHfConfig:
+
+             def __init__(self, architectures):
+                 self.architectures = architectures
+
+     model_config = MockModelConfig(architectures=[arch])
+     vllm_compatible_model, _ = ModelRegistry.resolve_model_cls(
+         architectures=[arch], model_config=model_config)
+
+     # This directly uses vLLM's checker, so it's always up-to-date.
+     # We assume is_vllm_model returns True for a valid model, and either
+     # returns False or raises an exception for an invalid one.
+     assert is_vllm_model(vllm_compatible_model)
+
+
+ def _run_server_and_bench(model_name: str, model_impl_type: str,
+                           port: int) -> float:
+     env = os.environ.copy()
+     env["MODEL_IMPL_TYPE"] = model_impl_type
+
+     # Start server
+     server_cmd = [
+         sys.executable,
+         "-m",
+         "vllm.entrypoints.cli.main",
+         "serve",
+         model_name,
+         "--port",
+         str(port),
+         "--max-model-len",
+         "2048",
+         "--tensor-parallel-size",
+         "1",
+         "--disable-log-requests",
+         "--no-enable-prefix-caching",
+         "--gpu-memory-utilization",
+         "0.90",
+     ]
+
+     print(f"Starting server ({model_impl_type}) on port {port}...")
+     # Use a new process group so we can kill the server and its children
+     # Use temporary files for stdout/stderr to avoid pipe buffer deadlocks
+     stdout_file = tempfile.TemporaryFile(mode='w+b')
+     stderr_file = tempfile.TemporaryFile(mode='w+b')
+     server_process = subprocess.Popen(server_cmd,
+                                       env=env,
+                                       stdout=stdout_file,
+                                       stderr=stderr_file,
+                                       preexec_fn=os.setsid)
+
+     try:
+         # Wait for server to be ready
+         start_time = time.time()
+         server_ready = False
+         while time.time() - start_time < 600:  # 10 minutes timeout
+             try:
+                 if requests.get(
+                         f"http://localhost:{port}/health").status_code == 200:
+                     server_ready = True
+                     break
+             except requests.exceptions.RequestException:
+                 pass
+
+             if server_process.poll() is not None:
+                 stdout_file.seek(0)
+                 stderr_file.seek(0)
+                 stdout = stdout_file.read().decode("utf-8", errors="replace")
+                 stderr = stderr_file.read().decode("utf-8", errors="replace")
+                 raise RuntimeError(
+                     f"Server process exited unexpectedly.\nStdout: {stdout}\nStderr: {stderr}"
+                 )
+
+             time.sleep(5)
+
+         if not server_ready:
+             stdout_file.seek(0)
+             stderr_file.seek(0)
+             stdout = stdout_file.read().decode("utf-8", errors="replace")
+             stderr = stderr_file.read().decode("utf-8", errors="replace")
+             raise RuntimeError(
+                 f"Server failed to start within timeout.\nStdout: {stdout}\nStderr: {stderr}"
+             )
+
+         print("Server is ready. Running benchmark...")
+
+         # Run benchmark
+         bench_cmd = [
+             "vllm", "bench", "serve", "--model", model_name, "--port",
+             str(port), "--dataset-name", "random", "--random-input-len", "50",
+             "--random-output-len", "128", "--num-prompts", "20"
+         ]
+
+         result = subprocess.run(bench_cmd,
+                                 env=env,
+                                 capture_output=True,
+                                 text=True)
+
+         if result.returncode != 0:
+             raise RuntimeError(
+                 f"Benchmark failed.\nStdout: {result.stdout}\nStderr: {result.stderr}"
+             )
+
+         # Parse throughput
+         # Output example: "Request throughput (req/s): 12.34"
+         match = re.search(r"Request throughput \(req/s\):\s+([\d\.]+)",
+                           result.stdout)
+         if not match:
+             raise ValueError(
+                 f"Could not parse throughput from output:\n{result.stdout}")
+
+         throughput = float(match.group(1))
+         return throughput
+
+     finally:
+         print("Stopping server...")
+         try:
+             os.killpg(os.getpgid(server_process.pid), signal.SIGTERM)
+         except ProcessLookupError:
+             pass
+         server_process.wait()
+         stdout_file.close()
+         stderr_file.close()
+         # Wait for TPU cleanup
+         time.sleep(5)
+
+
+ def test_flax_nnx_vs_vllm_performance():
+     """
+     Compares the performance of flax_nnx and vllm model implementations.
+
+     This test ensures that the JAX-native (`flax_nnx`) implementation's
+     performance is not significantly different from the vLLM-native PyTorch
+     (`vllm`) implementation. It measures the request throughput for both
+     backends and asserts that the percentage difference is within a
+     reasonable threshold.
+     """
+     model_name = "Qwen/Qwen3-4B"
+     # This should be 2-3% but 6% reduces flakiness.
+     percentage_difference_threshold = 0.06
+
+     throughput_vllm = _run_server_and_bench(model_name, "vllm", 8001)
+     throughput_flax = _run_server_and_bench(model_name, "flax_nnx", 8002)
+
+     print(f"vLLM (PyTorch) throughput: {throughput_vllm:.2f} req/s.")
+     print(f"flax_nnx (JAX) throughput: {throughput_flax:.2f} req/s.")
+
+     percentage_diff = abs(throughput_flax - throughput_vllm) / throughput_vllm
+     print(f"Percentage difference in throughput: {percentage_diff:.2%}.")
+
+     assert percentage_diff < percentage_difference_threshold, (
+         f"The performance difference between flax_nnx and vllm is too high. "
+         f"Difference: {percentage_diff:.2%}, Threshold: {percentage_difference_threshold:.2%}"
+     )
@@ -0,0 +1,111 @@
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # A simplified example to run multi-modal inference and verify the output.
+ # This script is a self-contained test that runs a single prompt and
+ # compares the output to a known-good output.
+
+ import difflib
+ import os
+ from dataclasses import asdict
+
+ import pytest
+ from vllm import LLM, EngineArgs, SamplingParams
+ from vllm.assets.image import ImageAsset
+ from vllm.multimodal.image import convert_image_mode
+
+ # Expected partial text output from the model. This is based on a previous
+ # run and is used for verification. The test passes if the generated output
+ # closely matches this text.
+ EXPECTED_TEXT = (
+     "The image depicts a tall, cylindrical tower with a lattice-like structure, surrounded by cherry blossom trees in full bloom. The cherry blossoms are in various stages of opening, with pink petals covering the branches. The sky is clear and blue, providing a vibrant backdrop to the scene. The tower appears to be a significant landmark"
+ )
+
+
+ # NOTE: Could be extended to more mm models/configs as needed
+ @pytest.mark.parametrize("enable_dynamic_image_sizes", [False, True])
+ def test_multi_modal_inference(monkeypatch, enable_dynamic_image_sizes):
+     """
+     Runs multi-modal inference and verifies the output.
+     """
+     os.environ['SKIP_JAX_PRECOMPILE'] = '1'  # Skip warmup to save time.
+     os.environ[
+         'VLLM_XLA_CHECK_RECOMPILATION'] = '0'  # Allow compilation during execution.
+
+     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+     # --- Configuration ---
+     model = "Qwen/Qwen2.5-VL-3B-Instruct"
+     tensor_parallel_size = 1
+     temperature = 0.0
+     max_tokens = 64
+     max_model_len = 4096
+     gpu_memory_utilization = 0.5
+     modality = "image"
+
+     print("Preparing for multi-modal inference...")
+
+     # --- Prepare Inputs ---
+     image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
+     question = "What is the content of this image?"
+
+     # Using Qwen2.5-VL prompt template
+     # NOTE: other models may be different
+     prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+               f"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+               f"{question}<|im_end|>\n"
+               "<|im_start|>assistant\n")
+
+     # --- Setup vLLM Engine ---
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=max_model_len,
+         tensor_parallel_size=tensor_parallel_size,
+         gpu_memory_utilization=gpu_memory_utilization,
+         max_num_seqs=1,
+         mm_processor_kwargs={
+             "min_pixels": 28 * 28,
+             "max_pixels": 1280 * 28 * 28,
+             "fps": 1,
+         },
+         limit_mm_per_prompt={modality: 1},
+     )
+     engine_args = asdict(engine_args)
+     if engine_args.get("additional_config") is None:
+         engine_args["additional_config"] = {}
+
+     engine_args["additional_config"][
+         "enable_dynamic_image_sizes"] = enable_dynamic_image_sizes
+     llm = LLM(**engine_args)
+
+     sampling_params = SamplingParams(
+         temperature=temperature,
+         max_tokens=max_tokens,
+     )
+
+     inputs = {
+         "prompt": prompt,
+         "multi_modal_data": {
+             "image": image
+         },
+     }
+
+     # --- Run Inference ---
+     print("Running inference...")
+     outputs = llm.generate(inputs, sampling_params)
+
+     # --- Verification ---
+     generated_text = outputs[0].outputs[0].text.strip()
+
+     print("-" * 50)
+     print("Generated Text:")
+     print(generated_text)
+     print("-" * 50)
+
+     # Check output
+     similarity_score = difflib.SequenceMatcher(None, generated_text,
+                                                EXPECTED_TEXT).ratio()
+     print(f"Similarity Score: {similarity_score:.4f}")
+     assert similarity_score >= 0.85, (
+         f"Text similarity too low ({similarity_score:.2f}).\n"
+         f"Expected: {EXPECTED_TEXT}\n"
+         f"Actual: {generated_text}")
@@ -0,0 +1,265 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ import os
+ import time
+ from dataclasses import asdict
+
+ import pytest
+ from vllm import LLM, EngineArgs, SamplingParams
+
+
+ @pytest.fixture
+ def model_name():
+     """Choose Llama3 8B as the test model, as it supports PP in the JAX model impl."""
+     return "meta-llama/Llama-3.1-8B-Instruct"
+
+
+ @pytest.fixture
+ def test_prompts():
+     """Simple test prompts for pipeline parallelism testing."""
+     return [
+         "Hello, my name is",
+         "The capital of France is",
+         "The colors of the rainbow are",
+         "The future of AI is",
+         "The president of the United States is",
+         "How many players are on a standard soccer team?",
+         "In Greek mythology, who is the god of the sea?",
+         "What is the capital of Australia?",
+         "What is the largest planet in our solar system?",
+         "Who developed the theory of general relativity?",
+     ]
+
+
+ @pytest.fixture
+ def sampling_params():
+     """Standard sampling parameters for testing."""
+     return SamplingParams(
+         temperature=0.0,
+         max_tokens=32,
+         ignore_eos=True,
+         logprobs=1,
+     )
+
+
+ def _run_inference_with_config(model_name: str,
+                                test_prompts: list,
+                                sampling_params: SamplingParams,
+                                tensor_parallel_size: int = 1,
+                                pipeline_parallel_size: int = 1,
+                                additional_config: dict = {},
+                                kv_cache_dtype: str = "auto",
+                                enable_prefix_caching: bool = False) -> list:
+     """Helper function to run inference with the specified configuration."""
+
+     # Create LLM args using a parser-based approach similar to offline_inference.py
+     engine_args = EngineArgs(
+         model=model_name,
+         max_model_len=128,
+         tensor_parallel_size=tensor_parallel_size,
+         pipeline_parallel_size=pipeline_parallel_size,
+         gpu_memory_utilization=0.95,
+         max_num_batched_tokens=128,
+         max_num_seqs=16,
+         enable_prefix_caching=enable_prefix_caching,
+         additional_config=additional_config,
+         kv_cache_dtype=kv_cache_dtype,
+     )
+
+     engine_args_dict = asdict(engine_args)
+     llm = LLM(**engine_args_dict)
+
+     try:
+         outputs = llm.generate(test_prompts, sampling_params)
+         return outputs
+     finally:
+         del llm
+         # Wait for TPUs to be released
+         time.sleep(5)
+
+
+ @pytest.mark.skip(reason="PP is not fully enabled.")
+ def test_pipeline_parallelism_jax_model(
+     model_name: str,
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that pipeline parallelism works on JAX models.
+
+     Equivalent to:
+     python examples/offline_inference.py --tensor_parallel_size=1 --pipeline_parallel_size=2
+     """
+     # Test with pipeline parallelism enabled
+     outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=test_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=2,
+     )
+
+     # Verify we got outputs for all prompts
+     assert len(outputs) == len(test_prompts)
+
+     # Verify each output has generated text
+     for output in outputs:
+         assert len(output.outputs) > 0
+         assert len(output.outputs[0].text.strip()) > 0
+
+     print(
+         f"✓ Pipeline Parallelism Jax model test passed with {len(outputs)} outputs"
+     )
+
+
+ @pytest.mark.skip(reason="PP is not fully enabled.")
+ def test_pipeline_parallelism_vllm_model(
+     model_name: str,
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that pipeline parallelism works on vLLM models, and that it also
+     works with tensor parallelism.
+
+     Equivalent to:
+     MODEL_IMPL_TYPE=vllm python examples/offline_inference.py --tensor_parallel_size=1 --pipeline_parallel_size=2
+     """
+
+     os.environ['MODEL_IMPL_TYPE'] = 'vllm'
+     # Test with pipeline parallelism enabled
+     outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=test_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=2,
+     )
+
+     # Verify we got outputs for all prompts
+     assert len(outputs) == len(test_prompts)
+
+     # Verify each output has generated text
+     for output in outputs:
+         assert len(output.outputs) > 0
+         assert len(output.outputs[0].text.strip()) > 0
+
+     print(
+         f"✓ Pipeline Parallelism vLLM model test passed with {len(outputs)} outputs"
+     )
+
+
+ @pytest.mark.skip(reason="PP is not fully enabled.")
+ def test_pipeline_parallelism_jax_model_correctness(
+     model_name: str,
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that pipeline parallelism produces consistent results compared to a baseline.
+     This test compares outputs from a single-device run with pipeline parallel runs
+     to ensure correctness, including log probabilities.
+     """
+     os.environ['SKIP_JAX_PRECOMPILE'] = '1'
+     os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '0'
+
+     # Use a smaller subset of prompts for correctness testing
+     small_prompts = test_prompts[:10]
+
+     # Run baseline (no PP)
+     baseline_outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=small_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=1,
+     )
+
+     # Run with pipeline parallelism enabled
+     pp_outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=small_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=2,
+     )
+
+     # Compare outputs. In theory they should be identical for greedy sampling;
+     # in practice there may be some differences, but overall the outputs should
+     # be very similar.
+
+     # An example:
+     # Prompt: What is the capital of Australia?
+     # Both of the following answers should be acceptable:
+     # The capital of Australia is Canberra. It is located in the Australian Capital Territory (ACT) and is home to many
+     # Canberra is the capital of Australia. It is located in the Australian Capital Territory (ACT) and is home to
+     assert len(baseline_outputs) == len(pp_outputs)
+
+     text_matches = 0
+     text_mismatches = 0
+     logprob_mismatches = 0
+     max_logprob_diff = 0.0
+
+     for i, (baseline, pp_result) in enumerate(zip(baseline_outputs,
+                                                   pp_outputs)):
+         baseline_text = baseline.outputs[0].text.strip()
+         pp_text = pp_result.outputs[0].text.strip()
+
+         # Check text output
+         if baseline_text == pp_text:
+             text_matches += 1
+         else:
+             text_mismatches += 1
+             print(f"Text mismatch found in prompt {i}:")
+             print(f" Baseline: {baseline_text}")
+             print(f" Pipeline Parallel: {pp_text}")
+
+         # Check log probabilities
+         baseline_logprobs = baseline.outputs[0].logprobs
+         pp_logprobs = pp_result.outputs[0].logprobs
+         if baseline_logprobs is not None and pp_logprobs is not None:
+             # Compare log probabilities for each token
+             assert len(baseline_logprobs) == len(pp_logprobs), \
+                 f"Logprobs length mismatch: {len(baseline_logprobs)} vs {len(pp_logprobs)}"
+             for token_idx, (base_lp, pp_lp) in enumerate(
+                     zip(baseline_logprobs, pp_logprobs)):
+                 # Get the top logprob value for the selected token
+                 if base_lp and pp_lp:
+                     # Get the top token's logprob from each
+                     base_top_token = list(base_lp.keys())[0]
+                     pp_top_token = list(pp_lp.keys())[0]
+
+                     base_logprob_val = base_lp[base_top_token].logprob
+                     pp_logprob_val = pp_lp[pp_top_token].logprob
+
+                     # Calculate absolute difference
+                     diff = abs(base_logprob_val - pp_logprob_val)
+                     max_logprob_diff = max(max_logprob_diff, diff)
+
+                     # Allow small numerical differences (e.g., 1e-3)
+                     if diff > 1e-3:
+                         logprob_mismatches += 1
+                         print(
+                             f"Logprob mismatch in prompt {i}, token {token_idx}:"
+                         )
+                         print(
+                             f" Baseline token: {base_top_token}, logprob: {base_logprob_val:.6f}"
+                         )
+                         print(
+                             f" PP token: {pp_top_token}, logprob: {pp_logprob_val:.6f}"
+                         )
+                         print(f" Difference: {diff:.6f}")
+
+     print("✓ Correctness test results:")
+     print(f" Text: {text_matches} matches, {text_mismatches} mismatches")
+     print(f" Max logprob difference: {max_logprob_diff:.6e}")
+     print(f" Significant logprob mismatches (>1e-3): {logprob_mismatches}")
+
+     # Allow for some variance due to potential numerical differences,
+     # but most outputs should match with greedy sampling.
+     text_match_rate = text_matches / len(baseline_outputs)
+     assert text_match_rate >= 0.9, f"Text match rate {text_match_rate:.2%} is too low"
+
+     # Log probabilities should be very close (allow small numerical errors)
+     assert max_logprob_diff < 1, f"Max logprob difference {max_logprob_diff} is too large"