tpu-inference 0.12.0.dev20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
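
For orientation, the e2e tests below drive the tpu-inference backend through vLLM's offline API (LLM, EngineArgs, SamplingParams). The following is a minimal sketch of that usage pattern, not a file from the package; the model name and size limits are illustrative values taken from the tests themselves.

    from vllm import LLM, SamplingParams

    # Small model and short limits purely for illustration; any model exercised
    # by the tests below (e.g. meta-llama/Llama-3.2-1B-Instruct) would do.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              max_model_len=128,
              max_num_seqs=16)
    params = SamplingParams(temperature=0.0, max_tokens=32)
    outputs = llm.generate(["Hello, my name is"], params)
    print(outputs[0].outputs[0].text)
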
tests/e2e/test_pipeline_parallel.py
@@ -0,0 +1,265 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ import os
+ import time
+ from dataclasses import asdict
+
+ import pytest
+ from vllm import LLM, EngineArgs, SamplingParams
+
+
+ @pytest.fixture
+ def model_name():
+     """Choose Llama3 8B as the test model, as it supports PP in the JAX model impl."""
+     return "meta-llama/Llama-3.1-8B-Instruct"
+
+
+ @pytest.fixture
+ def test_prompts():
+     """Simple test prompts for pipeline parallelism testing."""
+     return [
+         "Hello, my name is",
+         "The capital of France is",
+         "The colors of the rainbow are",
+         "The future of AI is",
+         "The president of the United States is",
+         "How many players are on a standard soccer team?",
+         "In Greek mythology, who is the god of the sea?",
+         "What is the capital of Australia?",
+         "What is the largest planet in our solar system?",
+         "Who developed the theory of general relativity?",
+     ]
+
+
+ @pytest.fixture
+ def sampling_params():
+     """Standard sampling parameters for testing."""
+     return SamplingParams(
+         temperature=0.0,
+         max_tokens=32,
+         ignore_eos=True,
+         logprobs=1,
+     )
+
+
+ def _run_inference_with_config(model_name: str,
+                                test_prompts: list,
+                                sampling_params: SamplingParams,
+                                tensor_parallel_size: int = 1,
+                                pipeline_parallel_size: int = 1,
+                                additional_config: dict = {},
+                                kv_cache_dtype: str = "auto",
+                                enable_prefix_caching: bool = False) -> list:
+     """Helper function to run inference with the specified configuration."""
+
+     # Create LLM args using a parser-based approach similar to offline_inference.py
+     engine_args = EngineArgs(
+         model=model_name,
+         max_model_len=128,
+         tensor_parallel_size=tensor_parallel_size,
+         pipeline_parallel_size=pipeline_parallel_size,
+         gpu_memory_utilization=0.95,
+         max_num_batched_tokens=128,
+         max_num_seqs=16,
+         enable_prefix_caching=enable_prefix_caching,
+         additional_config=additional_config,
+         kv_cache_dtype=kv_cache_dtype,
+     )
+
+     engine_args_dict = asdict(engine_args)
+     llm = LLM(**engine_args_dict)
+
+     try:
+         outputs = llm.generate(test_prompts, sampling_params)
+         return outputs
+     finally:
+         del llm
+         # Wait for TPUs to be released
+         time.sleep(5)
+
+
+ @pytest.mark.skip(reason="PP is not fully enabled.")
+ def test_pipeline_parallelism_jax_model(
+     model_name: str,
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that pipeline parallelism works on JAX models.
+
+     Equivalent to:
+     python examples/offline_inference.py --tensor_parallel_size=1 --pipeline_parallel_size=2
+     """
+     # Test with pipeline parallelism enabled
+     outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=test_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=2,
+     )
+
+     # Verify we got outputs for all prompts
+     assert len(outputs) == len(test_prompts)
+
+     # Verify each output has generated text
+     for output in outputs:
+         assert len(output.outputs) > 0
+         assert len(output.outputs[0].text.strip()) > 0
+
+     print(
+         f"✓ Pipeline Parallelism Jax model test passed with {len(outputs)} outputs"
+     )
+
+
+ @pytest.mark.skip(reason="PP is not fully enabled.")
+ def test_pipeline_parallelism_vllm_model(
+     model_name: str,
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that pipeline parallelism works on vLLM models, and that it also
+     works together with tensor parallelism.
+
+     Equivalent to:
+     MODEL_IMPL_TYPE=vllm python examples/offline_inference.py --tensor_parallel_size=1 --pipeline_parallel_size=2
+     """
+
+     os.environ['MODEL_IMPL_TYPE'] = 'vllm'
+     # Test with pipeline parallelism enabled
+     outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=test_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=2,
+     )
+
+     # Verify we got outputs for all prompts
+     assert len(outputs) == len(test_prompts)
+
+     # Verify each output has generated text
+     for output in outputs:
+         assert len(output.outputs) > 0
+         assert len(output.outputs[0].text.strip()) > 0
+
+     print(
+         f"✓ Pipeline Parallelism vLLM model test passed with {len(outputs)} outputs"
+     )
+
+
+ @pytest.mark.skip(reason="PP is not fully enabled.")
+ def test_pipeline_parallelism_jax_model_correctness(
+     model_name: str,
+     test_prompts: list,
+     sampling_params: SamplingParams,
+ ):
+     """
+     Test that pipeline parallelism produces consistent results compared to a baseline.
+     This test compares outputs from a single-device run with pipeline-parallel runs
+     to ensure correctness, including log probabilities.
+     """
+     os.environ['SKIP_JAX_PRECOMPILE'] = '1'
+     os.environ['VLLM_XLA_CHECK_RECOMPILATION'] = '0'
+
+     # Use a smaller subset of prompts for correctness testing
+     small_prompts = test_prompts[:10]
+
+     # Run baseline (no PP)
+     baseline_outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=small_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=1,
+     )
+
+     # Run with pipeline parallelism enabled
+     pp_outputs = _run_inference_with_config(
+         model_name=model_name,
+         test_prompts=small_prompts,
+         sampling_params=sampling_params,
+         tensor_parallel_size=1,
+         pipeline_parallel_size=2,
+     )
+
+     # Compare outputs. In theory they should be identical for greedy sampling;
+     # in practice there may be some differences, but overall the outputs should
+     # be very similar.
+
+     # An example:
+     # prompt: What is the capital of Australia?
+     # Both answers should be acceptable:
+     # The capital of Australia is Canberra. It is located in the Australian Capital Territory (ACT) and is home to many
+     # Canberra is the capital of Australia. It is located in the Australian Capital Territory (ACT) and is home to
+     assert len(baseline_outputs) == len(pp_outputs)
+
+     text_matches = 0
+     text_mismatches = 0
+     logprob_mismatches = 0
+     max_logprob_diff = 0.0
+
+     for i, (baseline, pp_result) in enumerate(zip(baseline_outputs,
+                                                   pp_outputs)):
+         baseline_text = baseline.outputs[0].text.strip()
+         pp_text = pp_result.outputs[0].text.strip()
+
+         # Check text output
+         if baseline_text == pp_text:
+             text_matches += 1
+         else:
+             text_mismatches += 1
+             print(f"Text mismatch found in prompt {i}:")
+             print(f" Baseline: {baseline_text}")
+             print(f" Pipeline Parallel: {pp_text}")
+
+         # Check log probabilities
+         baseline_logprobs = baseline.outputs[0].logprobs
+         pp_logprobs = pp_result.outputs[0].logprobs
+         if baseline_logprobs is not None and pp_logprobs is not None:
+             # Compare log probabilities for each token
+             assert len(baseline_logprobs) == len(pp_logprobs), \
+                 f"Logprobs length mismatch: {len(baseline_logprobs)} vs {len(pp_logprobs)}"
+             for token_idx, (base_lp, pp_lp) in enumerate(
+                     zip(baseline_logprobs, pp_logprobs)):
+                 # Get the top logprob value for the selected token
+                 if base_lp and pp_lp:
+                     # Get the top token's logprob from each
+                     base_top_token = list(base_lp.keys())[0]
+                     pp_top_token = list(pp_lp.keys())[0]
+
+                     base_logprob_val = base_lp[base_top_token].logprob
+                     pp_logprob_val = pp_lp[pp_top_token].logprob
+
+                     # Calculate the absolute difference
+                     diff = abs(base_logprob_val - pp_logprob_val)
+                     max_logprob_diff = max(max_logprob_diff, diff)
+
+                     # Allow small numerical differences (e.g., 1e-3)
+                     if diff > 1e-3:
+                         logprob_mismatches += 1
+                         print(
+                             f"Logprob mismatch in prompt {i}, token {token_idx}:"
+                         )
+                         print(
+                             f" Baseline token: {base_top_token}, logprob: {base_logprob_val:.6f}"
+                         )
+                         print(
+                             f" PP token: {pp_top_token}, logprob: {pp_logprob_val:.6f}"
+                         )
+                         print(f" Difference: {diff:.6f}")
+
+     print("✓ Correctness test results:")
+     print(f" Text: {text_matches} matches, {text_mismatches} mismatches")
+     print(f" Max logprob difference: {max_logprob_diff:.6e}")
+     print(f" Significant logprob mismatches (>1e-3): {logprob_mismatches}")
+
+     # Allow for some variance due to potential numerical differences,
+     # but most outputs should match with greedy sampling
+     text_match_rate = text_matches / len(baseline_outputs)
+     assert text_match_rate >= 0.9, f"Text match rate {text_match_rate:.2%} is too low"
+
+     # Log probabilities should be very close (allow small numerical errors)
+     assert max_logprob_diff < 1, f"Max logprob difference {max_logprob_diff} is too large"
tests/e2e/test_runai_model_streamer_loader.py
@@ -0,0 +1,104 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # This file contains end-to-end tests for the RunAI Model Streamer loader.
+ #
+ # The RunAI Model Streamer is a high-performance model loader that serves as an
+ # alternative to the default Hugging Face loader. Instead of downloading a model
+ # to local disk, it streams the weights from object storage (like GCS) into
+ # GPU memory. This streaming process is significantly faster than the
+ # traditional disk-based loading method.
+
+ # The tests in this file verify that loading model weights using the
+ # streamer produces the same results as loading the same model using the
+ # standard Hugging Face loader. This ensures the correctness of the streamer
+ # integration.
+
+ # The tests are performed by:
+ # 1. Loading a model from Google Cloud Storage using the `runai_streamer` format.
+ # 2. Generating output with this model.
+ # 3. Loading the same model from Hugging Face using the default loader.
+ # 4. Generating output with this second model.
+ # 5. Asserting that the outputs from both models are identical.
+
+ from __future__ import annotations
+
+ import time
+
+ import pytest
+ from vllm import LLM, SamplingParams
+
+
+ @pytest.fixture
+ def sampling_config():
+     return SamplingParams(temperature=0, max_tokens=10, ignore_eos=True)
+
+
+ @pytest.fixture
+ # TODO(amacaskill): Replace with GKE owned GCS bucket.
+ def gcs_model_name():
+     return "gs://vertex-model-garden-public-us/llama3/llama3-8b-hf"
+
+
+ @pytest.fixture
+ def hf_model_name():
+     return "meta-llama/Meta-Llama-3-8B"
+
+
+ @pytest.fixture
+ def prompt():
+     return "Hello, my name is"
+
+
+ def test_correctness(
+     sampling_config: SamplingParams,
+     gcs_model_name: str,
+     hf_model_name: str,
+     prompt: str,
+     monkeypatch: pytest.MonkeyPatch,
+ ):
+     '''
+     Compare the outputs of a model loaded from GCS via runai_model_streamer
+     and a model loaded from Hugging Face. The outputs should be the same.
+     These tests attempt to use tensor_parallel_size=1. The model is 16GB,
+     and v6e has 32GB of HBM, so it will fit.
+     '''
+     # Set ENV variables so that runai_model_streamer uses anonymous GCS access.
+     monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
+     monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
+     monkeypatch.setenv("CLOUD_STORAGE_EMULATOR_ENDPOINT",
+                        "https://storage.googleapis.com")
+     gcs_llm = LLM(model=gcs_model_name,
+                   load_format="runai_streamer",
+                   max_model_len=128,
+                   max_num_seqs=16,
+                   max_num_batched_tokens=256)
+     gcs_outputs = gcs_llm.generate([prompt], sampling_config)
+     gcs_output_text = gcs_outputs[0].outputs[0].text
+     del gcs_llm
+     time.sleep(10)  # Wait for TPUs to be released
+
+     # Test with the Hugging Face model
+     hf_llm = LLM(model=hf_model_name,
+                  max_model_len=128,
+                  max_num_seqs=16,
+                  max_num_batched_tokens=256)
+     hf_outputs = hf_llm.generate([prompt], sampling_config)
+     hf_output_text = hf_outputs[0].outputs[0].text
+     del hf_llm
+     time.sleep(10)  # Wait for TPUs to be released
+
+     assert gcs_output_text == hf_output_text, (
+         f"Outputs do not match! "
+         f"GCS output: {gcs_output_text}, HF output: {hf_output_text}")
tests/e2e/test_sampling_params.py
@@ -0,0 +1,269 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # This file contains end-to-end tests for sampling parameters.
+ #
+ # Sampling parameters control how the model selects tokens during generation.
+ # These tests verify that temperature, top_p, top_k, and logprobs work correctly.
+ #
+ # The tests in this file verify that:
+ # 1. Temperature=0 produces deterministic (greedy) outputs
+ # 2. Higher temperature produces more varied outputs
+ # 3. top_p (nucleus sampling) correctly constrains token selection
+ # 4. top_k correctly limits the number of candidate tokens
+ # 5. logprobs returns probability information for generated tokens
+
+ from __future__ import annotations
+
+ import pytest
+ from vllm import LLM, SamplingParams
+
+
+ @pytest.fixture(scope="module")
+ def llm():
+     """Create a shared LLM instance for all tests in this module."""
+     return LLM(
+         model='meta-llama/Llama-3.2-1B-Instruct',
+         max_model_len=1024,
+         max_num_seqs=4,
+         enable_prefix_caching=False,
+     )
+
+
+ class TestTemperature:
+     """Tests for temperature sampling parameter."""
+
+     def test_temperature_zero_is_deterministic(self, llm: LLM):
+         """Temperature=0 should produce identical outputs across multiple runs."""
+         prompt = "What is 2 + 2? Answer with just the number:"
+         sampling_params = SamplingParams(temperature=0, max_tokens=10)
+
+         outputs1 = llm.generate([prompt], sampling_params)
+         outputs2 = llm.generate([prompt], sampling_params)
+
+         assert outputs1[0].outputs[0].text == outputs2[0].outputs[0].text
+
+     def test_high_temperature_produces_variation(self, llm: LLM):
+         """High temperature should produce varied outputs across multiple runs."""
+         prompt = "Write a random word:"
+         sampling_params = SamplingParams(temperature=2,
+                                          max_tokens=10,
+                                          top_k=4096)
+
+         # Run multiple times and collect unique outputs
+         unique_outputs = set()
+         num_runs = 10
+         for _ in range(num_runs):
+             outputs = llm.generate([prompt], sampling_params)
+             unique_outputs.add(outputs[0].outputs[0].text)
+
+         # With high temperature, we expect some variation
+         assert len(unique_outputs) > 1, (
+             "High temperature should produce varied outputs")
+
+
+ class TestTopP:
+     """Tests for top_p (nucleus sampling) parameter."""
+
+     def test_top_p_restricts_sampling(self, llm: LLM):
+         """top_p=1.0 vs lower values should affect output diversity."""
+         prompt = "Name a color:"
+
+         # With top_p=1.0 (consider all tokens)
+         sampling_params_full = SamplingParams(temperature=0.8,
+                                               top_p=1.0,
+                                               max_tokens=5)
+
+         # With top_p=0.1 (very restrictive, only top tokens)
+         sampling_params_restricted = SamplingParams(temperature=0.8,
+                                                     top_p=0.1,
+                                                     max_tokens=5)
+
+         # Collect outputs with full nucleus
+         full_outputs = set()
+         for _ in range(10):
+             outputs = llm.generate([prompt], sampling_params_full)
+             full_outputs.add(outputs[0].outputs[0].text)
+
+         # Collect outputs with restricted nucleus
+         restricted_outputs = set()
+         for _ in range(10):
+             outputs = llm.generate([prompt], sampling_params_restricted)
+             restricted_outputs.add(outputs[0].outputs[0].text)
+
+         # Restricted top_p should generally produce less variety
+         # (though this isn't guaranteed, it's a reasonable expectation)
+         assert len(restricted_outputs) >= 1, "Should produce at least one output"
+         assert len(full_outputs) >= 1, "Should produce at least one output"
+
+     def test_top_p_with_temperature_zero(self, llm: LLM):
+         """top_p should have no effect when temperature=0 (greedy)."""
+         prompt = "The capital of France is"
+
+         sampling_params_1 = SamplingParams(temperature=0,
+                                            top_p=0.1,
+                                            max_tokens=10)
+         sampling_params_2 = SamplingParams(temperature=0,
+                                            top_p=0.9,
+                                            max_tokens=10)
+
+         outputs1 = llm.generate([prompt], sampling_params_1)
+         outputs2 = llm.generate([prompt], sampling_params_2)
+
+         # Both should produce identical outputs since temperature=0
+         assert outputs1[0].outputs[0].text == outputs2[0].outputs[0].text
+
+
+ class TestTopK:
+     """Tests for top_k sampling parameter."""
+
+     def test_top_k_restricts_sampling(self, llm: LLM):
+         """top_k should limit the candidate tokens for sampling."""
+         prompt = "Pick a number between 1 and 10:"
+
+         # top_k=1 is equivalent to greedy (always pick the most likely)
+         sampling_params_k1 = SamplingParams(temperature=1.0,
+                                             top_k=1,
+                                             max_tokens=5)
+
+         # top_k=-1 considers all tokens
+         sampling_params_all = SamplingParams(temperature=1.0,
+                                              top_k=-1,
+                                              max_tokens=5)
+
+         # With top_k=1, outputs should be deterministic
+         outputs_k1_run1 = llm.generate([prompt], sampling_params_k1)
+         outputs_k1_run2 = llm.generate([prompt], sampling_params_k1)
+         assert outputs_k1_run1[0].outputs[0].text == outputs_k1_run2[0].outputs[0].text
+
+         # With top_k=-1 and temperature=1.0, we may see variation
+         all_outputs = set()
+         for _ in range(10):
+             outputs = llm.generate([prompt], sampling_params_all)
+             all_outputs.add(outputs[0].outputs[0].text)
+
+         # Should produce at least one valid output
+         assert len(all_outputs) >= 1
+
+     def test_top_k_with_temperature_zero(self, llm: LLM):
+         """top_k should have no effect when temperature=0 (greedy)."""
+         prompt = "The largest planet is"
+
+         sampling_params_k5 = SamplingParams(temperature=0,
+                                             top_k=5,
+                                             max_tokens=10)
+         sampling_params_k50 = SamplingParams(temperature=0,
+                                              top_k=50,
+                                              max_tokens=10)
+
+         outputs1 = llm.generate([prompt], sampling_params_k5)
+         outputs2 = llm.generate([prompt], sampling_params_k50)
+
+         # Both should produce identical outputs since temperature=0
+         assert outputs1[0].outputs[0].text == outputs2[0].outputs[0].text
+
+
+ class TestLogprobs:
+     """Tests for logprobs parameter."""
+
+     def test_logprobs_returns_probabilities(self, llm: LLM):
+         """logprobs parameter should return log probabilities for tokens."""
+         prompt = "Hello"
+         sampling_params = SamplingParams(temperature=0,
+                                          max_tokens=5,
+                                          logprobs=5)
+
+         outputs = llm.generate([prompt], sampling_params)
+         output = outputs[0].outputs[0]
+
+         # Check that logprobs are returned
+         assert output.logprobs is not None, "logprobs should be returned"
+         assert len(output.logprobs) > 0, "logprobs should contain entries"
+
+         # Each token should have logprob information
+         for token_logprobs in output.logprobs:
+             assert token_logprobs is not None
+             # Should have up to 5 top logprobs as requested
+             assert len(token_logprobs) <= 5
+
+     def test_logprobs_none_returns_no_probabilities(self, llm: LLM):
+         """When logprobs=None, no log probabilities should be returned."""
+         prompt = "Hello"
+         sampling_params = SamplingParams(temperature=0,
+                                          max_tokens=5,
+                                          logprobs=None)
+
+         outputs = llm.generate([prompt], sampling_params)
+         output = outputs[0].outputs[0]
+
+         # logprobs should be None when not requested
+         assert output.logprobs is None, "logprobs should be None when not requested"
+
+     def test_logprobs_values_are_valid(self, llm: LLM):
+         """Log probabilities should be valid (negative or zero)."""
+         prompt = "The sky is"
+         sampling_params = SamplingParams(temperature=0,
+                                          max_tokens=3,
+                                          logprobs=3)
+
+         outputs = llm.generate([prompt], sampling_params)
+         output = outputs[0].outputs[0]
+
+         assert output.logprobs is not None
+         for token_logprobs in output.logprobs:
+             for token_id, logprob_obj in token_logprobs.items():
+                 # Log probabilities should be <= 0
+                 assert logprob_obj.logprob <= 0, (
+                     f"Log probability should be <= 0, got {logprob_obj.logprob}"
+                 )
+
+
+ class TestCombinedParameters:
+     """Tests for combinations of sampling parameters."""
+
+     def test_top_p_and_top_k_combined(self, llm: LLM):
+         """top_p and top_k can be used together."""
+         prompt = "List a fruit:"
+         sampling_params = SamplingParams(
+             temperature=0.7,
+             top_p=0.9,
+             top_k=50,
+             max_tokens=10,
+         )
+
+         outputs = llm.generate([prompt], sampling_params)
+         assert len(outputs[0].outputs[0].text) > 0
+
+     def test_all_params_with_logprobs(self, llm: LLM):
+         """All sampling parameters should work together with logprobs."""
+         prompt = "Complete this sentence: The weather today is"
+         sampling_params = SamplingParams(
+             temperature=0.5,
+             top_p=0.95,
+             top_k=40,
+             max_tokens=10,
+             logprobs=3,
+         )
+
+         outputs = llm.generate([prompt], sampling_params)
+         output = outputs[0].outputs[0]
+
+         # Should have generated text
+         assert len(output.text) > 0
+
+         # Should have logprobs
+         assert output.logprobs is not None
+         assert len(output.logprobs) > 0