tpu_inference-0.12.0.dev20251222-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
tests/layers/vllm/test_unquantized.py ADDED
@@ -0,0 +1,621 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import tempfile
+
+ import jax
+ import pytest
+ import torch
+ import torchax
+ from jax._src import test_util as jtu
+ from jax.sharding import NamedSharding, PartitionSpec
+ from torchax.interop import torch_view
+ from torchax.ops.mappings import j2t, t2j
+ from vllm.config import ParallelConfig, set_current_vllm_config
+ from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                              init_distributed_environment)
+ from vllm.engine.arg_utils import EngineArgs
+ from vllm.forward_context import set_forward_context
+ from vllm.model_executor.layers.fused_moe import FusedMoE
+ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                                LinearBase,
+                                                MergedColumnParallelLinear,
+                                                QKVParallelLinear,
+                                                RowParallelLinear)
+ from vllm.model_executor.model_loader import get_model as vllm_get_model
+
+ from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
+ from tpu_inference.layers.vllm.quantization.unquantized import (
+     VllmUnquantizedConfig, VllmUnquantizedFusedMoEMethod,
+     VllmUnquantizedLinearMethod)
+
+ from . import utils as test_utils
+
+ P = PartitionSpec
+ MODELS = ["Qwen/Qwen2-1.5B-Instruct"]
+
+
+ @pytest.fixture(autouse=True)
+ def setup_environment():
+     # This is a fake config used for init dist env.
+     # RowParallelLinear needs dist env to be initialized.
+     engine_args = EngineArgs(
+         model=MODELS[0],
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+
+     vllm_config = engine_args.create_engine_config()
+
+     with set_current_vllm_config(vllm_config):
+         temp_file = tempfile.mkstemp()[1]
+         init_distributed_environment(
+             1,
+             0,
+             local_rank=0,
+             distributed_init_method=f"file://{temp_file}",
+             backend="gloo")
+         ensure_model_parallel_initialized(1, 1)
+
+
+ @pytest.mark.parametrize("model", MODELS)
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ def test_quant_override(model, mesh):
+
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.model_config.dtype = torch.bfloat16
+
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     assert isinstance(quant_config, VllmUnquantizedConfig)
+     assert quant_config.vllm_config == vllm_config
+     assert quant_config.mesh == mesh
+
+
+ @pytest.mark.parametrize("model", MODELS)
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ def test_loading_model(model, mesh):
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.model_config.dtype = torch.bfloat16
+     vllm_config.quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     vllm_config.device_config.device = "cpu"
+
+     vllm_model = vllm_get_model(vllm_config=vllm_config)
+     layers = test_utils.find_all_layer_type(vllm_model, LinearBase)
+     for layer in layers:
+         assert isinstance(layer.quant_config, VllmUnquantizedConfig)
+         assert isinstance(layer.quant_method, VllmUnquantizedLinearMethod)
+
+
+ @pytest.mark.parametrize("model", MODELS)
+ @pytest.mark.parametrize("bias", [False, True])
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ @pytest.mark.parametrize("enable_sp", [False, True])
+ def test_row_parallel_linear(model, bias, mesh, enable_sp):
+     dtype = torch.bfloat16
+
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.compilation_config.pass_config.enable_sp = enable_sp
+
+     input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+     input_tensor = input_tensor.to('cpu')
+
+     with set_current_vllm_config(vllm_config):
+         row_linear = RowParallelLinear(
+             input_size=4096,
+             output_size=8192,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+         )
+
+     weight_data = torch.rand_like(row_linear.weight.data) / 10
+     if bias:
+         bias_data = torch.rand_like(row_linear.bias.data)
+
+     row_linear.weight.data = weight_data
+     if bias:
+         row_linear.bias.data = bias_data
+     row_linear = row_linear.to('cpu')
+     row_linear.quant_method.process_weights_after_loading(row_linear)
+     output = row_linear(input_tensor).to(dtype)
+
+     vllm_config.model_config.dtype = dtype
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     with set_current_vllm_config(vllm_config):
+         jax_row_linear = RowParallelLinear(
+             input_size=4096,
+             output_size=8192,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+             quant_config=quant_config,
+         )
+
+     jax_row_linear.weight.data = weight_data
+     if bias:
+         jax_row_linear.bias.data = bias_data
+
+     jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
+     jax_input_tensor.apply_jax_(jax.device_put,
+                                 NamedSharding(mesh, P(None, None)))
+     with torchax.default_env():
+         assert isinstance(jax_row_linear.quant_method,
+                           VllmUnquantizedLinearMethod)
+         jax_row_linear.quant_method.process_weights_after_loading(
+             jax_row_linear)
+         jax_output = jax_row_linear(jax_input_tensor)
+     # j2t() doesn't support bfloat16, so we cast it into float32 as an intermediate step.
+     jax_output = j2t(jax_output.to(torch.float32)).to(dtype)
+
+     torch.testing.assert_close(output, jax_output)
+
+
+ @pytest.mark.parametrize("model", MODELS)
+ @pytest.mark.parametrize("bias", [False, True])
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ @pytest.mark.parametrize("enable_sp", [False, True])
+ def test_column_parallel_linear(model, bias, mesh, enable_sp):
+     dtype = torch.bfloat16
+
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.compilation_config.pass_config.enable_sp = enable_sp
+
+     input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+     input_tensor = input_tensor.to('cpu')
+
+     with set_current_vllm_config(vllm_config):
+         column_linear = ColumnParallelLinear(
+             input_size=4096,
+             output_size=8192,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+         )
+
+     weight_data = torch.rand_like(column_linear.weight.data) / 10
+     if bias:
+         bias_data = torch.rand_like(column_linear.bias.data)
+
+     column_linear.weight.data = weight_data
+     if bias:
+         column_linear.bias.data = bias_data
+     column_linear = column_linear.to('cpu')
+     column_linear.quant_method.process_weights_after_loading(column_linear)
+     output = column_linear(input_tensor).to(dtype)
+
+     vllm_config.model_config.dtype = dtype
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     with set_current_vllm_config(vllm_config):
+         jax_column_linear = ColumnParallelLinear(
+             input_size=4096,
+             output_size=8192,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+             quant_config=quant_config,
+         )
+
+     jax_column_linear.weight.data = weight_data
+     if bias:
+         jax_column_linear.bias.data = bias_data
+
+     jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
+     jax_input_tensor.apply_jax_(jax.device_put,
+                                 NamedSharding(mesh, P(None, None)))
+     with torchax.default_env():
+         assert isinstance(jax_column_linear.quant_method,
+                           VllmUnquantizedLinearMethod)
+         jax_column_linear.quant_method.process_weights_after_loading(
+             jax_column_linear)
+         jax_output = jax_column_linear(jax_input_tensor)
+     jax_output = j2t(jax_output.to(torch.float32)).to(dtype)
+
+     torch.testing.assert_close(output, jax_output)
+
+
+ @pytest.mark.parametrize("model", MODELS)
+ @pytest.mark.parametrize("bias", [False, True])
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ @pytest.mark.parametrize("enable_sp", [False, True])
+ @pytest.mark.parametrize("fuse_matmuls", [False, True])
+ def test_qkv_parallel_linear(model, bias, mesh, enable_sp, fuse_matmuls):
+     dtype = torch.bfloat16
+
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.compilation_config.pass_config.enable_sp = enable_sp
+
+     input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+     input_tensor = input_tensor.to('cpu')
+
+     with set_current_vllm_config(vllm_config):
+         qkv_linear = QKVParallelLinear(
+             hidden_size=4096,
+             head_size=128,
+             total_num_heads=32,
+             total_num_kv_heads=8,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+         )
+
+     weight_data = torch.rand_like(qkv_linear.weight.data) / 10
+     if bias:
+         bias_data = torch.rand_like(qkv_linear.bias.data)
+
+     qkv_linear.weight.data = weight_data
+     if bias:
+         qkv_linear.bias.data = bias_data
+     qkv_linear = qkv_linear.to('cpu')
+     qkv_linear.quant_method.process_weights_after_loading(qkv_linear)
+     output = qkv_linear(input_tensor).to(dtype)
+
+     vllm_config.model_config.dtype = dtype
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     with set_current_vllm_config(vllm_config):
+         jax_qkv_linear = QKVParallelLinear(
+             hidden_size=4096,
+             head_size=128,
+             total_num_heads=32,
+             total_num_kv_heads=8,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+             quant_config=quant_config,
+         )
+     jax_qkv_linear.quant_method.fuse_matmuls = fuse_matmuls
+
+     jax_qkv_linear.weight.data = weight_data
+     if bias:
+         jax_qkv_linear.bias.data = bias_data
+
+     jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
+     jax_input_tensor.apply_jax_(jax.device_put,
+                                 NamedSharding(mesh, P(None, None)))
+     with torchax.default_env():
+         assert isinstance(jax_qkv_linear.quant_method,
+                           VllmUnquantizedLinearMethod)
+         jax_qkv_linear.quant_method.process_weights_after_loading(
+             jax_qkv_linear)
+         jax_output = jax_qkv_linear(jax_input_tensor)
+     jax_output = j2t(jax_output.to(torch.float32)).to(dtype)
+
+     torch.testing.assert_close(output, jax_output)
+
+
+ @pytest.mark.parametrize("model", MODELS)
+ @pytest.mark.parametrize("bias", [False, True])
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ @pytest.mark.parametrize("fuse_matmuls", [False, True])
+ @pytest.mark.parametrize("enable_sp", [False, True])
+ def test_merged_column_parallel_linear(model, bias, mesh, fuse_matmuls,
+                                        enable_sp):
+     dtype = torch.bfloat16
+
+     engine_args = EngineArgs(
+         model=model,
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.compilation_config.pass_config.enable_sp = enable_sp
+
+     input_tensor = torch.rand(10, 4096, dtype=dtype) / 10
+     input_tensor = input_tensor.to('cpu')
+
+     # Call vLLM code
+     with set_current_vllm_config(vllm_config):
+         merged_column_linear = MergedColumnParallelLinear(
+             input_size=4096,
+             output_sizes=[14336] * 2,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+         )
+
+     weight_data = torch.rand_like(merged_column_linear.weight.data) / 10
+     if bias:
+         bias_data = torch.rand_like(merged_column_linear.bias.data)
+
+     merged_column_linear.weight.data = weight_data
+     if bias:
+         merged_column_linear.bias.data = bias_data
+     merged_column_linear = merged_column_linear.to('cpu')
+     merged_column_linear.quant_method.process_weights_after_loading(
+         merged_column_linear)
+     output = merged_column_linear(input_tensor).to(dtype)
+
+     # Call tpu_inference code
+     vllm_config.model_config.dtype = dtype
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     with set_current_vllm_config(vllm_config):
+         jax_merged_column_linear = MergedColumnParallelLinear(
+             input_size=4096,
+             output_sizes=[14336] * 2,
+             bias=bias,
+             params_dtype=dtype,
+             return_bias=False,
+             quant_config=quant_config,
+         )
+     jax_merged_column_linear.quant_method.fuse_matmuls = fuse_matmuls
+
+     jax_merged_column_linear.weight.data = weight_data
+     if bias:
+         jax_merged_column_linear.bias.data = bias_data
+
+     jax_input_tensor = torch_view(t2j(input_tensor, use_dlpack=False))
+     jax_input_tensor.apply_jax_(jax.device_put,
+                                 NamedSharding(mesh, P(None, None)))
+     with torchax.default_env():
+         assert isinstance(jax_merged_column_linear.quant_method,
+                           VllmUnquantizedLinearMethod)
+         jax_merged_column_linear.quant_method.process_weights_after_loading(
+             jax_merged_column_linear)
+         jax_output = jax_merged_column_linear(jax_input_tensor)
+     jax_output = j2t(jax_output.to(torch.float32)).to(dtype)
+
+     torch.testing.assert_close(output, jax_output)
+
+
+ @pytest.mark.parametrize("use_ep", [True, False])
+ @pytest.mark.parametrize("mesh", [
+     test_utils.get_spmd_mesh(1),
+     test_utils.get_spmd_mesh(jax.local_device_count())
+ ])
+ @pytest.mark.parametrize("num_tokens", [8])
+ @pytest.mark.parametrize("intermediate_size", [1024, 2048])
+ @pytest.mark.parametrize("hidden_size", [128, 512])
+ @pytest.mark.parametrize("num_experts", [8])
+ @pytest.mark.parametrize("topk", [2])
+ @pytest.mark.parametrize("has_bias", [False, True])
+ @pytest.mark.parametrize("activation", ["silu", "swigluoai"])
+ def test_fused_moe(use_ep, mesh, num_tokens, intermediate_size, hidden_size,
+                    num_experts, topk, has_bias, activation):
+
+     torch.manual_seed(42)
+     dtype = torch.bfloat16
+
+     a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
+     w1 = torch.randn(
+         (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
+     w2 = torch.randn(
+         (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
+     score = torch.randn((num_tokens, num_experts), dtype=dtype)
+
+     w1_bias = w2_bias = None
+     if has_bias:
+         w1_bias = torch.randn(
+             (num_experts, 2 * intermediate_size), dtype=dtype) / 10
+         w2_bias = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
+
+     engine_args = EngineArgs(
+         model="Qwen/Qwen2-1.5B-Instruct",
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.model_config.dtype = dtype
+
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     with set_current_vllm_config(vllm_config):
+         vllm_fused_moe = FusedMoE(
+             num_experts=num_experts,
+             top_k=topk,
+             hidden_size=hidden_size,
+             intermediate_size=intermediate_size,
+             reduce_results=False,
+             renormalize=False,
+             tp_size=1,
+             dp_size=1,
+             quant_config=quant_config,
+             has_bias=has_bias,
+             activation=activation,
+         )
+     vllm_fused_moe.moe_parallel_config.use_ep = use_ep
+     vllm_fused_moe.w13_weight.data = w1
+     vllm_fused_moe.w2_weight.data = w2
+     if has_bias:
+         vllm_fused_moe.w13_bias.data = w1_bias
+         vllm_fused_moe.w2_bias.data = w2_bias
+
+     expected = test_utils.ref_moe(a, score, w1, w2, w1_bias, w2_bias,
+                                   vllm_fused_moe.top_k,
+                                   vllm_fused_moe.renormalize,
+                                   vllm_fused_moe.activation)
+
+     with torchax.default_env(), set_forward_context(None, vllm_config):
+         assert isinstance(vllm_fused_moe.quant_method,
+                           VllmUnquantizedFusedMoEMethod)
+
+         jax_a = a.to('jax')
+         score = score.to('jax')
+
+         vllm_fused_moe.quant_method.process_weights_after_loading(
+             vllm_fused_moe)
+         actual = vllm_fused_moe(jax_a, score)
+
+     torch.testing.assert_close(expected,
+                                actual,
+                                check_device=False,
+                                atol=1e-1,
+                                rtol=1e-1)
+
+
+ @pytest.mark.parametrize("mesh",
+                          [test_utils.get_spmd_mesh(jax.local_device_count())])
+ @pytest.mark.parametrize("num_tokens", [128, 512])
+ @pytest.mark.parametrize("intermediate_size", [512])
+ @pytest.mark.parametrize("hidden_size", [512])
+ @pytest.mark.parametrize("num_experts", [32])
+ @pytest.mark.parametrize("topk", [8])
+ @pytest.mark.parametrize("has_bias", [False, True])
+ def test_fused_moe_use_kernel(mesh, num_tokens, intermediate_size, hidden_size,
+                               num_experts, topk, has_bias):
+
+     # TODO(Qiliang Cui): Remove when issue is resolved.
+     if not jtu.is_device_tpu_at_least(version=7):
+         pytest.skip(allow_module_level=True, reason="Expected TPUv7+")
+
+     torch.manual_seed(42)
+     dtype = torch.bfloat16
+
+     a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
+     w1 = torch.randn(
+         (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
+     w2 = torch.randn(
+         (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
+
+     w1_bias = w2_bias = None
+     if has_bias:
+         w1_bias = torch.randn(
+             (num_experts, 2 * intermediate_size), dtype=dtype) / 10
+         w2_bias = torch.randn((num_experts, hidden_size), dtype=dtype) / 10
+
+     # Use deterministic gating_output generation (same logic as fused_moe_v1_test.py)
+     # Generate base gating scores with deterministic pattern
+     score = (
+         torch.randn((num_tokens, num_experts), dtype=torch.float32) +
+         torch.arange(num_tokens * num_experts, dtype=torch.float32).reshape(
+             num_tokens, num_experts) / 100)
+
+     # Generate unique top-k indices
+     generator = torch.Generator()
+     generator.manual_seed(42)
+     top_k_indices = torch.randint(0,
+                                   num_experts - 1, (num_tokens, topk),
+                                   dtype=torch.int32,
+                                   generator=generator)
+
+     # Add one-hot encoding weighted by 10 to ensure selected experts have highest scores
+     one_hot = torch.nn.functional.one_hot(top_k_indices.long(),
+                                           num_classes=num_experts).float()
+     one_hot = one_hot.sum(dim=1) * 10
+     score = (score + one_hot).to(dtype)
+
+     engine_args = EngineArgs(
+         model="Qwen/Qwen2-1.5B-Instruct",
+         max_model_len=64,
+         max_num_batched_tokens=64,
+         max_num_seqs=4,
+     )
+     vllm_config = engine_args.create_engine_config()
+     vllm_config.model_config.dtype = dtype
+     vllm_config.parallel_config = ParallelConfig(
+         tensor_parallel_size=mesh.devices.size)
+
+     quant_config = get_tpu_quantization_config(vllm_config, mesh)
+     with set_current_vllm_config(vllm_config):
+         vllm_fused_moe = FusedMoE(
+             num_experts=num_experts,
+             top_k=topk,
+             hidden_size=hidden_size,
+             intermediate_size=intermediate_size,
+             reduce_results=True,
+             renormalize=False,
+             tp_size=mesh.devices.size,
+             dp_size=1,
+             quant_config=quant_config,
+             has_bias=has_bias,
+         )
+     vllm_fused_moe.moe_parallel_config.use_ep = True
+     vllm_fused_moe.quant_method.use_kernel = True
+
+     vllm_fused_moe.w13_weight.data = w1
+     vllm_fused_moe.w2_weight.data = w2
+     if has_bias:
+         vllm_fused_moe.w13_bias.data = w1_bias
+         vllm_fused_moe.w2_bias.data = w2_bias
+
+     expected = test_utils.ref_moe(a, score, w1, w2, w1_bias, w2_bias,
+                                   vllm_fused_moe.top_k,
+                                   vllm_fused_moe.renormalize,
+                                   vllm_fused_moe.activation)
+
+     with torchax.default_env(), set_forward_context(None, vllm_config):
+         assert isinstance(vllm_fused_moe.quant_method,
+                           VllmUnquantizedFusedMoEMethod)
+         jax_a = a.to('jax')
+         score = score.to('jax')
+
+         vllm_fused_moe.quant_method.process_weights_after_loading(
+             vllm_fused_moe)
+         vllm_fused_moe.quant_method.block_size = {
+             "bt": 32,
+             "bf": 512,
+             "bd1": 512,
+             "bd2": 512,
+             "btc": 32,
+             "bfc": 256,
+             "bd1c": 256,
+             "bd2c": 256,
+         }
+         actual = vllm_fused_moe(jax_a, score)
+
+     torch.testing.assert_close(
+         expected,
+         actual,
+         check_device=False,
+         atol=1e-2,
+         rtol=1e-2,
+     )
tests/layers/vllm/utils.py ADDED
@@ -0,0 +1,72 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import jax
+ import torch
+ import torch.nn.functional as F
+
+
+ def get_spmd_mesh(num_devices: int = 1):
+     axis_names = ("data", "model")
+     devices = sorted(jax.devices(), key=lambda d: d.id)[0:num_devices]
+     mesh_shape = (1, len(devices))
+     return jax.make_mesh(mesh_shape, axis_names, devices=devices)
+
+
+ def find_all_layer_type(module: torch.nn.Module, layer_type: torch.nn.Module):
+     ret = []
+     for name, child in module.named_children():
+         if isinstance(child, layer_type):
+             ret.append(child)
+         else:
+             ret.extend(find_all_layer_type(child, layer_type))
+     return ret
+
+
+ # TODO(kyuyeunk): Consolidate all reference implementations used for unit tests
+ # into a single file.
+ def ref_moe(x, router_logits, w1, w2, w1_bias, w2_bias, top_k, renormalize,
+             activation):
+
+     expert_weights = F.softmax(router_logits, dim=-1)
+     expert_weights, expert_indices = torch.topk(expert_weights, top_k, dim=-1)
+     if renormalize:
+         expert_weights /= expert_weights.sum(dim=-1, keepdim=True)
+
+     x = torch.einsum("ti,eoi->teo", x, w1)
+     if w1_bias is not None:
+         x += w1_bias.unsqueeze(0)
+
+     match activation:
+         case "silu":
+             x1, x3 = x.chunk(chunks=2, dim=-1)
+             x = F.silu(x1) * x3
+         case "swigluoai":
+             x1, x3 = x[..., ::2], x[..., 1::2]
+             x1 = x1.clamp(min=None, max=7.0)
+             x3 = x3.clamp(min=-7.0, max=7.0)
+             gated_activation = x1 * torch.sigmoid(x1 * 1.702)
+             x = gated_activation * (x3 + 1)
+         case _:
+             raise NotImplementedError(
+                 f"No reference implementation for {activation} activation")
+
+     x = torch.einsum("teo,eio->tei", x, w2)
+     if w2_bias is not None:
+         x += w2_bias.unsqueeze(0)
+
+     seq_indexes = torch.arange(x.shape[0]).unsqueeze(1)
+     x = x[seq_indexes, expert_indices]
+
+     return torch.einsum("tai,ta->ti", x, expert_weights)
tests/lora/__init__.py ADDED
@@ -0,0 +1,13 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.