tpu-inference 0.12.0.dev20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
tests/e2e/test_async_scheduler.py
@@ -0,0 +1,211 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import random
+ import string
+ import time
+
+ import pytest
+ from vllm import LLM, SamplingParams
+
+
+ @pytest.fixture
+ def sampling_config():
+     return SamplingParams(temperature=0,
+                           max_tokens=120,
+                           ignore_eos=True,
+                           repetition_penalty=1,
+                           frequency_penalty=0,
+                           presence_penalty=0,
+                           min_p=0,
+                           logprobs=None)
+
+
+ @pytest.fixture
+ def model_name():
+     return "Qwen/Qwen2.5-1.5B-Instruct"
+
+
+ def get_test_prompts():
+     """
+     Generates a list of prompts with a fixed word count.
+
+     The prompt set is controlled by two local constants:
+         num_prompts: The number of prompts to generate (500).
+         input_len_words: The number of repeated words in each prompt (120).
+
+     Returns:
+         A list of num_prompts strings, each consisting of the prefix
+         "Keep repeating: " followed by input_len_words repetitions of a
+         random lowercase letter.
+     """
+     num_prompts = 500
+     input_len_words = 120
+     prompts = []
+
+     # For example, if w = 's', the generated prompt will be
+     # "Keep repeating: s s s ..."
+     num_repetitions = input_len_words
+     prefix = "Keep repeating: "
+
+     for _ in range(num_prompts):
+         # 1. Pick a random lowercase letter
+         w = random.choice(string.ascii_lowercase)
+
+         # 2. Create the string of repeated words
+         #    This will have num_repetitions words
+         repeating_part = " ".join([w] * num_repetitions)
+
+         # 3. Combine with the prefix
+         print(f"{prefix}{repeating_part}")
+         prompts.append(f"{prefix}{repeating_part}")
+
+     return prompts
+
+
+ def _test_performance_helper(monkeypatch: pytest.MonkeyPatch,
+                              sampling_config: SamplingParams, model_name: str,
+                              min_speedup: float):
+     '''
+     Helper function to test async scheduler decoding performance.
+     Compares timing between reference LLM and async LLM using Qwen2.5-1.5B.
+     '''
+
+     with monkeypatch.context():
+         # Shared prompt set: 500 prompts, 120 repeated words each
+         test_prompts = get_test_prompts()
+
+         # Test reference LLM timing
+         ref_llm = LLM(model=model_name,
+                       max_model_len=800,
+                       max_num_seqs=24,
+                       max_num_batched_tokens=512,
+                       enable_prefix_caching=False,
+                       async_scheduling=0)
+
+         start_time = time.time()
+         _ = ref_llm.generate(test_prompts, sampling_config)
+         ref_time = time.time() - start_time
+
+         del ref_llm
+         # Waiting for TPUs to be released
+         time.sleep(10)
+
+         # Test async LLM timing
+         async_llm = LLM(model=model_name,
+                         max_model_len=800,
+                         max_num_seqs=24,
+                         max_num_batched_tokens=512,
+                         enable_prefix_caching=False,
+                         async_scheduling=1)
+
+         start_time = time.time()
+         _ = async_llm.generate(test_prompts, sampling_config)
+         async_time = time.time() - start_time
+
+         del async_llm
+         # Waiting for TPUs to be released
+         time.sleep(10)
+
+         speedup = ref_time / async_time
+         print(f"Reference LLM time: {ref_time:.2f}s")
+         print(f"Async LLM time: {async_time:.2f}s")
+         print(f"Speedup: {speedup:.2f}x")
+
+         assert speedup >= min_speedup, (
+             f"Expected at least {min_speedup}x speedup for async scheduler, "
+             f"got {speedup:.2f}x")
+
+
+ def test_performance(
+     monkeypatch: pytest.MonkeyPatch,
+     sampling_config: SamplingParams,
+     model_name: str,
+ ):
+     '''
+     Test that async scheduler decoding provides significant performance improvement.
+     Compares timing between reference LLM and async LLM using Qwen2.5-1.5B.
+     Expects async_llm to be at least 1.3x faster than ref_llm.
+     '''
+     min_speed_up = 1.3
+     _test_performance_helper(monkeypatch, sampling_config, model_name,
+                              min_speed_up)
+
+
+ def _test_correctness_helper(
+     monkeypatch: pytest.MonkeyPatch,
+     sampling_config: SamplingParams,
+     model_name: str,
+ ):
+     '''
+     Helper function to test async scheduler correctness.
+     The outputs of the original LLM and the async LLM should be the same
+     when async scheduler decoding is used.
+
+     Known Edge Case (KV Cache Swapping):
+     Under this case, even though the temperature is set to 0, the output is
+     still slightly different every time. This is expected behaviour, as the
+     normal scheduler behaves the same way, which makes it difficult to
+     design a test for that scenario.
+     '''
+     with monkeypatch.context():
+         test_prompts = get_test_prompts()
+
+         ref_llm = LLM(model=model_name,
+                       max_model_len=1024,
+                       max_num_seqs=100,
+                       async_scheduling=0)
+         ref_outputs = ref_llm.generate(test_prompts, sampling_config)
+
+         del ref_llm
+
+         # Waiting for TPUs to be released.
+         time.sleep(10)
+
+         async_llm = LLM(model=model_name,
+                         max_model_len=1024,
+                         max_num_seqs=100,
+                         async_scheduling=1)
+         async_outputs = async_llm.generate(test_prompts, sampling_config)
+
+         matches = 0
+         misses = 0
+         for ref_output, async_output in zip(ref_outputs, async_outputs):
+             if ref_output.outputs[0].text == async_output.outputs[0].text:
+                 matches += 1
+             else:
+                 misses += 1
+             print(f"ref_output: {ref_output.outputs[0].text}")
+             print(f"async_output: {async_output.outputs[0].text}")
+
+         assert misses == 0
+         del async_llm
+
+         # Waiting for TPUs to be released.
+         time.sleep(10)
+
+
+ def test_async_correctness(
+     monkeypatch: pytest.MonkeyPatch,
+     sampling_config: SamplingParams,
+     model_name: str,
+ ):
+     '''
+     The outputs of the original LLM and the async LLM should be the same
+     when the async scheduler is used.
+     '''
+
+     _test_correctness_helper(monkeypatch, sampling_config, model_name)
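
The tests above drive the async scheduler through pytest fixtures. For readers who want to reproduce the reference-vs-async timing comparison as a standalone script, here is a minimal sketch; it assumes a TPU host with vllm and tpu-inference installed, reuses the model name and LLM keyword arguments from the test above, and shortens the prompt set purely for illustration.

# Minimal sketch: compare async-scheduler decoding against the default
# scheduler outside pytest. Keyword arguments mirror test_async_scheduler.py.
import time

from vllm import LLM, SamplingParams

params = SamplingParams(temperature=0, max_tokens=120, ignore_eos=True)
prompts = ["Keep repeating: " + " ".join(["s"] * 120)] * 8

timings = {}
for label, async_scheduling in (("reference", False), ("async", True)):
    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
              max_model_len=800,
              max_num_seqs=24,
              max_num_batched_tokens=512,
              enable_prefix_caching=False,
              async_scheduling=async_scheduling)
    start = time.time()
    llm.generate(prompts, params)
    timings[label] = time.time() - start
    del llm
    time.sleep(10)  # wait for TPUs to be released before the next run

print(f"Speedup: {timings['reference'] / timings['async']:.2f}x")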
tests/e2e/test_data_parallel.py
@@ -0,0 +1,393 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ import os
+ import time
+ from dataclasses import asdict
+
+ import pytest
+ from vllm import LLM, EngineArgs, SamplingParams
+
+
+ @pytest.fixture(autouse=True)
+ def setup_new_model_design():
+     """Automatically set NEW_MODEL_DESIGN=1 for all tests."""
+     os.environ['NEW_MODEL_DESIGN'] = '1'
+
+
+ @pytest.fixture
+ def test_prompts():
+     """Simple test prompts for data parallelism testing."""
+     return [
+         "Hello, my name is",
+         "The capital of France is",
+         "The colors of the rainbow are",
+         "The future of AI is",
+         "The president of the United States is",
+         "How many players are on a standard soccer team?",
+         "In Greek mythology, who is the god of the sea?",
+         "What is the capital of Australia?",
+         "What is the largest planet in our solar system?",
+         "Who developed the theory of general relativity?",
+     ]
+
+
+ @pytest.fixture
+ def sampling_params():
+     """Standard sampling parameters for testing."""
+     return SamplingParams(
+         temperature=0.0,
+         max_tokens=32,
+         ignore_eos=True,
+         logprobs=1,
+     )
+
+
+ def _run_inference_with_config(model_name: str,
+                                test_prompts: list,
+                                sampling_params: SamplingParams,
+                                tensor_parallel_size: int = 1,
+                                data_parallel_size: int = 1,
+                                additional_config: dict = {},
+                                kv_cache_dtype: str = "auto",
+                                enable_prefix_caching: bool = False,
+                                async_scheduling: bool = False,
+                                measure_time: bool = False,
+                                max_model_len: int = 32,
+                                max_num_batched_tokens: int = 128,
+                                max_num_seqs: int = 16):
+     """Helper function to run inference with the specified configuration.
+
+     Returns:
+         If measure_time=True: (outputs, elapsed_time) tuple
+         If measure_time=False: outputs list
+     """
+
+     # Build the engine args, mirroring examples/offline_inference.py
+     engine_args = EngineArgs(
+         model=model_name,
+         max_model_len=max_model_len,
+         tensor_parallel_size=tensor_parallel_size,
+         data_parallel_size=data_parallel_size,
+         gpu_memory_utilization=0.98,
+         max_num_batched_tokens=max_num_batched_tokens,
+         max_num_seqs=max_num_seqs,
+         enable_prefix_caching=enable_prefix_caching,
+         additional_config=additional_config,
+         kv_cache_dtype=kv_cache_dtype,
+         async_scheduling=async_scheduling,
+     )
+
+     engine_args_dict = asdict(engine_args)
+     llm = LLM(**engine_args_dict)
+
+     try:
+         start_time = time.time()
+         outputs = llm.generate(test_prompts, sampling_params)
+         elapsed_time = time.time() - start_time
+         if measure_time:
+             return outputs, elapsed_time
+         else:
+             return outputs
+     finally:
+         del llm
+         # Wait for TPUs to be released
+         time.sleep(5)
+
+
+ def test_data_parallelism_performance(sampling_params: SamplingParams):
+     """
+     Test that data parallelism provides performance improvements compared to
+     the baseline. This test measures the execution time with 128 prompts of
+     roughly 1k tokens each.
+
+     Note: This is a performance benchmark test with large prompts.
+     """
+     os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '1'
+     os.environ['SKIP_JAX_PRECOMPILE'] = '0'
+     os.environ['MODEL_IMPL_TYPE'] = 'flax_nnx'
+
+     model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+
+     # Generate 128 prompts of approximately 1k tokens each,
+     # built from a base prompt of repeated text
+     base_text = (
+         "The rapid advancement of artificial intelligence has transformed numerous industries "
+         "and continues to reshape our understanding of technology's potential. Machine learning "
+         "algorithms have become increasingly sophisticated, enabling computers to perform tasks "
+         "that were once thought to require human intelligence. From natural language processing "
+         "to computer vision, AI systems are now capable of understanding context, recognizing "
+         "patterns, and making decisions with remarkable accuracy. " *
+         20  # Repeat to reach ~1k tokens
+     )
+
+     # Create 128 prompts with slight variations
+     long_prompts = [
+         f"Prompt {i}: {base_text} What are your thoughts on this topic?"
+         for i in range(128)
+     ]
+
+     print(f"Generated {len(long_prompts)} prompts, approximately "
+           f"{len(base_text.split())} words each")
+
+     # Configuration for long sequences
+     max_model_len = 2048
+     max_num_batched_tokens = 4096
+     max_num_seqs = 64
+
+     # Run baseline (no data parallelism) with timing
+     baseline_outputs, baseline_time = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=long_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         data_parallel_size=1,
+         async_scheduling=True,
+         measure_time=True,
+         max_model_len=max_model_len,
+         max_num_batched_tokens=max_num_batched_tokens,
+         max_num_seqs=max_num_seqs,
+     )
+
+     # Run with model data parallelism and async scheduling, with timing
+     dp_outputs, dp_time = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=long_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         data_parallel_size=2,
+         async_scheduling=True,
+         measure_time=True,
+         max_model_len=max_model_len,
+         max_num_batched_tokens=max_num_batched_tokens,
+         max_num_seqs=max_num_seqs,
+     )
+
+     # Calculate speedup
+     speedup = baseline_time / dp_time if dp_time > 0 else 0
+
+     print("✓ Performance test results:")
+     print(f"  Number of prompts: {len(long_prompts)}")
+     print(f"  Baseline time: {baseline_time:.2f}s")
+     print(f"  Data parallel time: {dp_time:.2f}s")
+     print(f"  Speedup: {speedup:.2f}x")
+     print(
+         f"  Baseline throughput: {len(long_prompts) / baseline_time:.2f} prompts/s"
+     )
+     print(
+         f"  Data parallel throughput: {len(long_prompts) / dp_time:.2f} prompts/s"
+     )
+
+
+ @pytest.mark.parametrize("model_impl_type", ["vllm", "flax_nnx"])
+ def test_model_data_parallelism(
+     test_prompts: list,
+     sampling_params: SamplingParams,
+     model_impl_type: str,
+ ):
+     """
+     Test model-wise data parallelism where data=2 in the mesh axis.
+     This test verifies that the model can run with data parallelism enabled,
+     duplicating the entire model across 2 data parallel workers.
+
+     Equivalent to:
+     python examples/offline_inference.py --tensor_parallel_size=1 --data_parallel_size=2
+     """
+     # Use Llama 1B for this test
+     test_model = "meta-llama/Llama-3.2-1B-Instruct"
+     os.environ['MODEL_IMPL_TYPE'] = model_impl_type
+     os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '0'
+     os.environ['SKIP_JAX_PRECOMPILE'] = '1'
+
+     # Test with data parallelism enabled
+     outputs = _run_inference_with_config(
+         model_name=test_model,
+         test_prompts=test_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         data_parallel_size=2,
+         async_scheduling=False,
+     )
+
+     # Verify we got outputs for all prompts
+     assert len(outputs) == len(test_prompts), (
+         f"Expected {len(test_prompts)} outputs, got {len(outputs)}")
+
+     # Verify each output has generated text
+     for output in outputs:
+         assert len(output.outputs) > 0, "Output has no generated text"
+         assert len(
+             output.outputs[0].text.strip()) > 0, "Generated text is empty"
+
+     print(f"✓ Model data parallelism test passed with {len(outputs)} outputs")
+
+
+ def test_attention_data_parallelism(
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test attention data parallelism where only the attention layer gets
+     duplicated, attn_dp=2 in the mesh axis. This is useful when
+     num_kv_heads < TP, to avoid wasting KV cache memory.
+
+     Equivalent to:
+     python examples/offline_inference.py --tensor_parallel_size=4 --kv-cache-dtype=fp8 \
+         --additional_config='{"sharding":{"sharding_strategy": {"enable_dp_attention":1}}}'
+     """
+     # Use Qwen3 0.6B for this test with reduced tensor parallelism
+     test_model = "Qwen/Qwen3-0.6B"
+
+     os.environ['MODEL_IMPL_TYPE'] = "flax_nnx"
+     os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '0'
+     os.environ['SKIP_JAX_PRECOMPILE'] = '1'
+
+     additional_config = {
+         "sharding": {
+             "sharding_strategy": {
+                 "enable_dp_attention": 1
+             }
+         }
+     }
+
+     # Test with attention data parallelism enabled
+     # Reduced tensor_parallel_size from 8 to 4 to avoid memory exhaustion
+     outputs = _run_inference_with_config(
+         model_name=test_model,
+         test_prompts=test_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=4,
+         data_parallel_size=1,
+         additional_config=additional_config,
+         kv_cache_dtype="fp8",
+     )
+
+     # Verify we got outputs for all prompts
+     assert len(outputs) == len(test_prompts), (
+         f"Expected {len(test_prompts)} outputs, got {len(outputs)}")
+
+     # Verify each output has generated text
+     for output in outputs:
+         assert len(output.outputs) > 0, "Output has no generated text"
+         assert len(
+             output.outputs[0].text.strip()) > 0, "Generated text is empty"
+
+     print(
+         f"✓ Attention data parallelism test passed with {len(outputs)} outputs"
+     )
+
+
+ def test_data_parallelism_correctness(
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that data parallelism produces consistent results compared to a baseline.
+     This test compares outputs from a single-device run with data parallel runs
+     to ensure correctness, including log probabilities.
+     """
+     os.environ['SKIP_JAX_PRECOMPILE'] = '1'
+     os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '0'
+     os.environ['MODEL_IMPL_TYPE'] = "flax_nnx"
+
+     model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+     # Use a smaller subset of prompts for correctness testing
+     small_prompts = test_prompts[:10]
+
+     # Run baseline (no data parallelism)
+     baseline_outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=small_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         data_parallel_size=1,
+         async_scheduling=True,
+     )
+
+     # Run with model data parallelism and async scheduling
+     dp_outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=small_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         data_parallel_size=2,
+         async_scheduling=True,
+     )
+
+     # Compare outputs - they should be identical for greedy sampling
+     assert len(baseline_outputs) == len(dp_outputs)
+
+     text_matches = 0
+     text_mismatches = 0
+     logprob_mismatches = 0
+     max_logprob_diff = 0.0
+
+     for i, (baseline, dp_result) in enumerate(zip(baseline_outputs,
+                                                   dp_outputs)):
+         baseline_text = baseline.outputs[0].text.strip()
+         dp_text = dp_result.outputs[0].text.strip()
+
+         # Check text output
+         if baseline_text == dp_text:
+             text_matches += 1
+         else:
+             text_mismatches += 1
+             print(f"Text mismatch found in prompt {i}:")
+             print(f"  Baseline: {baseline_text}")
+             print(f"  Data Parallel: {dp_text}")
+
+         # Check log probabilities
+         baseline_logprobs = baseline.outputs[0].logprobs
+         dp_logprobs = dp_result.outputs[0].logprobs
+
+         if baseline_logprobs is not None and dp_logprobs is not None:
+             # Compare log probabilities for each token
+             assert len(baseline_logprobs) == len(dp_logprobs), \
+                 f"Logprobs length mismatch: {len(baseline_logprobs)} vs {len(dp_logprobs)}"
+
+             for token_idx, (base_lp, dp_lp) in enumerate(
+                     zip(baseline_logprobs, dp_logprobs)):
+                 # Get the top logprob value for the selected token
+                 if base_lp and dp_lp:
+                     # Get the top token's logprob from each
+                     base_top_token = list(base_lp.keys())[0]
+                     dp_top_token = list(dp_lp.keys())[0]
+
+                     base_logprob_val = base_lp[base_top_token].logprob
+                     dp_logprob_val = dp_lp[dp_top_token].logprob
+
+                     # Calculate absolute difference
+                     diff = abs(base_logprob_val - dp_logprob_val)
+                     max_logprob_diff = max(max_logprob_diff, diff)
+
+                     # Allow small numerical differences
+                     if diff > 0.15:
+                         logprob_mismatches += 1
+                         print(f"Logprob mismatch in prompt {i}, "
+                               f"token {token_idx}:")
+                         print(f"  Baseline token: {base_top_token}, "
+                               f"logprob: {base_logprob_val:.6f}")
+                         print(f"  DP token: {dp_top_token}, "
+                               f"logprob: {dp_logprob_val:.6f}")
+                         print(f"  Difference: {diff:.6f}")
+
+     print("✓ Correctness test results:")
+     print(f"  Text: {text_matches} matches, {text_mismatches} mismatches")
+     print(f"  Max logprob difference: {max_logprob_diff:.6e}")
+     print(f"  Significant logprob mismatches (>0.15): {logprob_mismatches}")
+
+     # Allow for some variance due to potential numerical differences,
+     # but most outputs should match with greedy sampling
+     text_match_rate = text_matches / len(baseline_outputs)
+     assert text_match_rate >= 0.9, f"Text match rate {text_match_rate:.2%} is too low"
+
+     # Log probabilities should be very close (allow small numerical errors)
+     assert max_logprob_diff < 0.15, f"Max logprob difference {max_logprob_diff} is too large"
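
For reference, the attention data-parallel configuration exercised in test_attention_data_parallelism can also be driven as a standalone script through the same EngineArgs path that _run_inference_with_config uses. The following is a minimal sketch, assuming a TPU host with vllm and tpu-inference installed; every value below mirrors the test and helper above rather than a recommended production setting.

# Minimal sketch: attention data parallelism via additional_config,
# mirroring test_data_parallel.py above.
import os
from dataclasses import asdict

from vllm import LLM, EngineArgs, SamplingParams

# Environment variables copied from the test above.
os.environ['MODEL_IMPL_TYPE'] = "flax_nnx"
os.environ['SKIP_JAX_PRECOMPILE'] = '1'

engine_args = EngineArgs(
    model="Qwen/Qwen3-0.6B",
    max_model_len=32,
    tensor_parallel_size=4,
    data_parallel_size=1,
    gpu_memory_utilization=0.98,
    max_num_batched_tokens=128,
    max_num_seqs=16,
    enable_prefix_caching=False,
    kv_cache_dtype="fp8",
    additional_config={
        "sharding": {
            "sharding_strategy": {
                "enable_dp_attention": 1
            }
        }
    },
)

# Same EngineArgs -> dict -> LLM construction as the helper above.
llm = LLM(**asdict(engine_args))
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)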