PyPI - tpu-inference - Versions diffs - 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl - Mend

tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (250) hide show

tests/__init__.py +13 -0
tests/core/__init__.py +13 -0
tests/core/test_disagg_utils.py +14 -0
tests/core/test_dp_scheduler.py +650 -768
tests/core/test_init.py +14 -0
tests/distributed/__init__.py +13 -0
tests/distributed/test_distributed_utils.py +120 -0
tests/distributed/test_tpu_connector.py +478 -0
tests/e2e/__init__.py +13 -0
tests/e2e/test_async_scheduler.py +211 -0
tests/e2e/test_data_parallel.py +289 -0
tests/e2e/test_hybrid_kvcache.py +219 -0
tests/e2e/test_local_disagg.py +257 -0
tests/e2e/test_model_loader.py +268 -0
tests/e2e/test_multi_modal_inference.py +111 -0
tests/e2e/test_pipeline_parallel.py +265 -0
tests/e2e/test_runai_model_streamer_loader.py +104 -0
tests/e2e/test_sampling_params.py +269 -0
tests/e2e/test_speculative_decoding.py +311 -0
tests/e2e/test_structured_decoding.py +46 -0
tests/executors/__init__.py +13 -0
tests/executors/test_ray_distributed_executor.py +199 -0
tests/experimental/__init__.py +13 -0
tests/experimental/test_llama3_jax_stashed.py +208 -0
tests/kernels/__init__.py +13 -0
tests/kernels/collectives/__init__.py +13 -0
tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
tests/kernels/fused_moe_v1_test.py +14 -0
tests/kernels/gmm_test.py +205 -0
tests/kernels/mla_v1_test.py +143 -41
tests/kernels/quantized_matmul_kernel_test.py +2 -34
tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
tests/layers/__init__.py +13 -0
tests/layers/common/__init__.py +13 -0
tests/layers/common/test_attention_interface.py +156 -0
tests/layers/common/test_quantization.py +149 -0
tests/layers/jax/__init__.py +13 -0
tests/layers/jax/attention/__init__.py +13 -0
tests/layers/jax/attention/test_common_attention.py +103 -0
tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
tests/layers/jax/attention/test_llama4_attention.py +135 -0
tests/layers/jax/moe/__init__.py +13 -0
tests/layers/jax/moe/test_deepseek_moe.py +235 -0
tests/layers/jax/sample/__init__.py +13 -0
tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
tests/layers/jax/sample/test_sampling.py +115 -0
tests/layers/jax/sample/test_sampling_metadata.py +254 -0
tests/layers/jax/test_layers.py +155 -0
tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
tests/layers/jax/test_rope.py +93 -0
tests/layers/jax/test_sharding.py +159 -0
tests/layers/jax/test_transformer_block.py +152 -0
tests/layers/vllm/__init__.py +13 -0
tests/layers/vllm/test_attention.py +363 -0
tests/layers/vllm/test_awq.py +405 -0
tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
tests/layers/vllm/test_fp8.py +17 -0
tests/layers/vllm/test_mxfp4.py +312 -0
tests/layers/vllm/test_unquantized.py +651 -0
tests/layers/vllm/utils.py +87 -0
tests/lora/__init__.py +13 -0
tests/lora/conftest.py +14 -0
tests/lora/test_bgmv.py +14 -0
tests/lora/test_layers.py +21 -3
tests/lora/test_lora.py +15 -1
tests/lora/test_lora_perf.py +67 -0
tests/models/__init__.py +13 -0
tests/models/common/__init__.py +13 -0
tests/models/common/test_model_loader.py +455 -0
tests/models/jax/__init__.py +13 -0
tests/models/jax/test_deepseek_v3.py +401 -0
tests/models/jax/test_llama3.py +184 -0
tests/models/jax/test_llama4.py +298 -0
tests/models/jax/test_llama_eagle3.py +197 -0
tests/models/jax/test_llama_guard_4.py +242 -0
tests/models/jax/test_qwen2.py +172 -0
tests/models/jax/test_qwen2_5_vl.py +605 -0
tests/models/jax/test_qwen3.py +169 -0
tests/models/jax/test_weight_loading.py +180 -0
tests/models/jax/utils/__init__.py +13 -0
tests/models/jax/utils/test_multi_modal_utils.py +212 -0
tests/platforms/__init__.py +13 -0
tests/platforms/test_tpu_platform.py +54 -0
tests/runner/__init__.py +13 -0
tests/runner/test_block_table.py +395 -0
tests/runner/test_input_batch.py +226 -0
tests/runner/test_kv_cache.py +220 -0
tests/runner/test_kv_cache_manager.py +498 -0
tests/runner/test_multimodal_manager.py +429 -0
tests/runner/test_persistent_batch_manager.py +84 -0
tests/runner/test_speculative_decoding_manager.py +368 -0
tests/runner/test_structured_decoding_manager.py +220 -0
tests/runner/test_tpu_runner.py +261 -0
tests/runner/test_tpu_runner_dp.py +1099 -0
tests/runner/test_tpu_runner_mesh.py +200 -0
tests/runner/test_utils.py +411 -0
tests/spec_decode/__init__.py +13 -0
tests/spec_decode/test_eagle3.py +311 -0
tests/test_base.py +14 -0
tests/test_envs.py +78 -1
tests/test_tpu_info.py +14 -0
tests/test_utils.py +1 -43
tests/worker/__init__.py +13 -0
tests/worker/tpu_worker_test.py +414 -0
tpu_inference/__init__.py +14 -0
tpu_inference/core/__init__.py +13 -0
tpu_inference/core/sched/__init__.py +13 -0
tpu_inference/core/sched/dp_scheduler.py +372 -56
tpu_inference/distributed/__init__.py +13 -0
tpu_inference/distributed/jax_parallel_state.py +14 -0
tpu_inference/distributed/tpu_connector.py +14 -9
tpu_inference/distributed/utils.py +56 -4
tpu_inference/envs.py +38 -7
tpu_inference/executors/__init__.py +13 -0
tpu_inference/executors/ray_distributed_executor.py +17 -0
tpu_inference/experimental/__init__.py +13 -0
tpu_inference/experimental/llama3_jax_stashed.py +14 -0
tpu_inference/kernels/__init__.py +13 -0
tpu_inference/kernels/collectives/__init__.py +13 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
tpu_inference/kernels/flash_attention/__init__.py +13 -0
tpu_inference/kernels/fused_moe/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
tpu_inference/kernels/megablox/__init__.py +13 -0
tpu_inference/kernels/megablox/common.py +54 -0
tpu_inference/kernels/megablox/gmm.py +646 -0
tpu_inference/kernels/mla/__init__.py +13 -0
tpu_inference/kernels/mla/v1/__init__.py +13 -0
tpu_inference/kernels/mla/v1/kernel.py +117 -145
tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
tpu_inference/layers/__init__.py +13 -0
tpu_inference/layers/common/__init__.py +13 -0
tpu_inference/layers/common/attention_interface.py +26 -19
tpu_inference/layers/common/attention_metadata.py +14 -0
tpu_inference/layers/common/quant_methods.py +15 -0
tpu_inference/layers/common/quantization.py +270 -0
tpu_inference/layers/common/sharding.py +28 -5
tpu_inference/layers/jax/__init__.py +13 -0
tpu_inference/layers/jax/attention/__init__.py +13 -0
tpu_inference/layers/jax/attention/attention.py +19 -6
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
tpu_inference/layers/jax/base.py +14 -0
tpu_inference/layers/jax/constants.py +13 -0
tpu_inference/layers/jax/layers.py +14 -0
tpu_inference/layers/jax/misc.py +14 -0
tpu_inference/layers/jax/moe/__init__.py +13 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
tpu_inference/layers/jax/moe/moe.py +43 -3
tpu_inference/layers/jax/pp_utils.py +53 -0
tpu_inference/layers/jax/rope.py +14 -0
tpu_inference/layers/jax/rope_interface.py +14 -0
tpu_inference/layers/jax/sample/__init__.py +13 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
tpu_inference/layers/jax/sample/sampling.py +15 -1
tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
tpu_inference/layers/jax/transformer_block.py +14 -0
tpu_inference/layers/vllm/__init__.py +13 -0
tpu_inference/layers/vllm/attention.py +4 -4
tpu_inference/layers/vllm/fused_moe.py +210 -260
tpu_inference/layers/vllm/linear_common.py +57 -22
tpu_inference/layers/vllm/quantization/__init__.py +16 -0
tpu_inference/layers/vllm/quantization/awq.py +15 -1
tpu_inference/layers/vllm/quantization/common.py +33 -18
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
tpu_inference/layers/vllm/quantization/fp8.py +118 -0
tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
tpu_inference/layers/vllm/sharding.py +21 -4
tpu_inference/lora/__init__.py +13 -0
tpu_inference/lora/torch_lora_ops.py +8 -13
tpu_inference/models/__init__.py +13 -0
tpu_inference/models/common/__init__.py +13 -0
tpu_inference/models/common/model_loader.py +74 -35
tpu_inference/models/jax/__init__.py +13 -0
tpu_inference/models/jax/deepseek_v3.py +267 -157
tpu_inference/models/jax/gpt_oss.py +26 -10
tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
tpu_inference/models/jax/llama3.py +99 -36
tpu_inference/models/jax/llama4.py +14 -0
tpu_inference/models/jax/llama_eagle3.py +14 -0
tpu_inference/models/jax/llama_guard_4.py +15 -1
tpu_inference/models/jax/qwen2.py +17 -2
tpu_inference/models/jax/qwen2_5_vl.py +18 -4
tpu_inference/models/jax/qwen3.py +17 -2
tpu_inference/models/jax/utils/__init__.py +13 -0
tpu_inference/models/jax/utils/file_utils.py +14 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +89 -26
tpu_inference/models/jax/utils/weight_utils.py +39 -2
tpu_inference/models/vllm/__init__.py +13 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
tpu_inference/platforms/__init__.py +14 -0
tpu_inference/platforms/tpu_platform.py +47 -64
tpu_inference/runner/__init__.py +13 -0
tpu_inference/runner/compilation_manager.py +72 -37
tpu_inference/runner/kv_cache.py +54 -20
tpu_inference/runner/kv_cache_manager.py +46 -17
tpu_inference/runner/lora_utils.py +14 -0
tpu_inference/runner/multimodal_manager.py +15 -1
tpu_inference/runner/persistent_batch_manager.py +14 -0
tpu_inference/runner/speculative_decoding_manager.py +14 -0
tpu_inference/runner/structured_decoding_manager.py +14 -0
tpu_inference/runner/tpu_runner.py +44 -17
tpu_inference/spec_decode/__init__.py +13 -0
tpu_inference/spec_decode/jax/__init__.py +13 -0
tpu_inference/spec_decode/jax/eagle3.py +13 -0
tpu_inference/tpu_info.py +14 -0
tpu_inference/utils.py +42 -36
tpu_inference/worker/__init__.py +13 -0
tpu_inference/worker/tpu_worker.py +63 -50
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/METADATA +7 -9
tpu_inference-0.13.2rc3.dist-info/RECORD +261 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/WHEEL +0 -0
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/licenses/LICENSE +0 -0
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/top_level.txt +0 -0

tests/e2e/test_structured_decoding.py ADDED Viewed

@@ -0,0 +1,46 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file contains end-to-end tests for structured decoding.
+#
+# Structured decoding allows constraining the model's output to follow a
+# specific format, such as choosing from a predefined set of options or
+# following a JSON schema. This is useful for classification tasks,
+# structured data extraction, and ensuring outputs conform to expected formats.
+# The tests in this file verify that:
+# 1. Choice-based structured decoding correctly constrains output to valid options
+# 2. The model produces deterministic results when given structured constraints
+from __future__ import annotations
+from vllm import LLM, SamplingParams
+from vllm.sampling_params import StructuredOutputsParams
+def test_structured_decoding():
+    llm = LLM(model='meta-llama/Llama-3.2-1B-Instruct',
+              max_model_len=1024,
+              max_num_seqs=1,
+              enable_prefix_caching=False)
+    choices = ['Positive', 'Negative']
+    structured_outputs_params = StructuredOutputsParams(choice=choices)
+    sampling_params = SamplingParams(
+        structured_outputs=structured_outputs_params)
+    outputs = llm.generate(
+        prompts="Classify this sentiment: tpu-inference is wonderful!",
+        sampling_params=sampling_params,
+    )
+    assert outputs[0].outputs[0].text in choices

tests/executors/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tests/executors/test_ray_distributed_executor.py ADDED Viewed

@@ -0,0 +1,199 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from unittest.mock import MagicMock, patch
+# Mock VllmConfig and its nested configs to avoid dependencies on the actual
+# classes, which can be complex to instantiate for testing.
+class MockVllmConfig:
+    def __init__(self):
+        self.parallel_config = MagicMock()
+        self.parallel_config.world_size = 4
+        self.parallel_config.tensor_parallel_size = 2
+        self.parallel_config.pipeline_parallel_size = 1
+        self.parallel_config.ray_workers_use_nsight = False
+        self.parallel_config.placement_group = None
+        self.parallel_config.max_parallel_loading_workers = None
+        self.sharding_config = MagicMock()
+        self.sharding_config.total_devices = 2
+        self.model_config = MagicMock()
+        self.cache_config = MagicMock()
+        self.lora_config = MagicMock()
+        self.load_config = MagicMock()
+        self.scheduler_config = MagicMock()
+        self.speculative_config = MagicMock()
+        self.prompt_adapter_config = MagicMock()
+        self.observability_config = MagicMock()
+        self.device_config = MagicMock()
+        self.ec_transfer_config = MagicMock()
+@patch(
+    "vllm.v1.executor.ray_distributed_executor.RayDistributedExecutor.__init__",
+    lambda x, y: None)
+@patch("tpu_inference.executors.ray_distributed_executor.envs")
+@patch("tpu_inference.executors.ray_distributed_executor.ray")
+@patch("tpu_inference.executors.ray_distributed_executor.current_platform")
+@patch("tpu_inference.executors.ray_distributed_executor.get_ip",
+       return_value="127.0.0.1")
+@patch("tpu_inference.executors.ray_distributed_executor.get_open_port",
+       return_value=12345)
+@patch(
+    "tpu_inference.executors.ray_distributed_executor.available_resources_per_node"
+)
+@patch("tpu_inference.executors.ray_distributed_executor._wait_until_pg_ready")
+class TestTpuRayDistributedExecutor(unittest.TestCase):
+    def setUp(self):
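+        # Import the class under test lazily, at fixture time rather than at
+        # module import. The class-level @patch decorators wrap only the
+        # test_* methods, so no patches are active while setUp runs.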
+        from tpu_inference.executors.ray_distributed_executor import \
+            RayDistributedExecutor
+        self.RayDistributedExecutor = RayDistributedExecutor
+        self.vllm_config = MockVllmConfig()
+        # Reset placement group for each test as it might be modified.
+        self.vllm_config.parallel_config.placement_group = None
+        self.vllm_config.kv_transfer_config = None
+    def test_init_executor_basic_flow(self, mock_wait_until_pg_ready,
+                                      mock_avail_resources, mock_get_port,
+                                      mock_get_ip, mock_platform, mock_ray,
+                                      mock_envs):
+        # --- Setup mocks ---
+        mock_envs.VLLM_USE_RAY_COMPILED_DAG = True
+        mock_envs.VLLM_USE_RAY_SPMD_WORKER = True
+        mock_envs.VLLM_RAY_BUNDLE_INDICES = ""
+        mock_platform.ray_device_key = "TPU"
+        mock_platform.device_name = "tpu"
+        mock_platform.device_control_env_var = "TPU_VISIBLE_CHIPS"
+        mock_platform.additional_env_vars = []
+        mock_ray.is_initialized.return_value = False
+        mock_ray.nodes.return_value = [{"Resources": {"TPU": 4}}]
+        mock_ray.get_runtime_context.return_value.get_node_id.return_value = "node_1"
+        mock_avail_resources.return_value = {"node_1": {"TPU": 4}}
+        mock_wait_until_pg_ready.return_value = None
+        mock_placement_group = MagicMock()
+        mock_placement_group.bundle_specs = [{"TPU": 1}] * 4
+        mock_ray.util.placement_group.return_value = mock_placement_group
+        mock_worker = MagicMock()
+        mock_worker.get_node_and_gpu_ids.remote.return_value = [("node_1",
+                                                                 [0, 1, 2, 3])]
+        mock_ray.remote.return_value.remote.return_value = mock_worker
+        # Simulate remote calls on the worker
+        mock_ray.get.side_effect = [
+            ["127.0.0.1"] * 4,  # worker_ips
+            *[("node_1", [i]) for i in range(4)]  # worker_node_and_tpu_ids
+        ]
+        executor = self.RayDistributedExecutor(self.vllm_config)
+        # Members of the parent class
+        executor.uses_ray = True
+        executor.vllm_config = self.vllm_config
+        executor.parallel_config = self.vllm_config.parallel_config
+        executor.collective_rpc = MagicMock()
+        executor.collective_rpc.return_value = None
+        # --- Initialization ---
+        executor._init_executor()
+        # --- Assertions ---
+        mock_ray.init.assert_called_once()
+        self.assertIsNotNone(executor.parallel_config.placement_group)
+        self.assertEqual(len(executor.workers), 4)
+    def test_initialize_ray_cluster_no_tpu_on_driver_raises_error(
+            self, mock_wait_until_pg_ready, mock_avail_resources,
+            mock_get_port, mock_get_ip, mock_platform, mock_ray, mock_envs):
+        # --- Setup Mocks ---
+        mock_platform.ray_device_key = "TPU"
+        mock_platform.device_name = "tpu"
+        mock_ray.is_initialized.return_value = False
+        mock_ray.nodes.return_value = [{"Resources": {"TPU": 4}}]
+        mock_ray.get_runtime_context.return_value.get_node_id.return_value = "driver_node"
+        # Simulate no TPUs on the driver node
+        mock_avail_resources.return_value = {
+            "driver_node": {
+                "CPU": 8
+            },
+            "worker_node": {
+                "TPU": 4
+            }
+        }
+        executor = self.RayDistributedExecutor(self.vllm_config)
+        executor.vllm_config = self.vllm_config
+        executor.parallel_config = self.vllm_config.parallel_config
+        # --- Test and Assert ---
+        with self.assertRaisesRegex(ValueError,
+                                    "Current node has no TPU available"):
+            executor._initialize_ray_cluster()
+    def test_init_workers_ray_sorts_correctly(self, mock_wait_until_pg_ready,
+                                              mock_avail_resources,
+                                              mock_get_port, mock_get_ip,
+                                              mock_platform, mock_ray,
+                                              mock_envs):
+        # --- Setup Mocks ---
+        mock_envs.VLLM_RAY_BUNDLE_INDICES = ""
+        mock_platform.ray_device_key = "TPU"
+        mock_get_ip.return_value = "10.0.0.1"  # Driver IP
+        mock_pg = MagicMock()
+        mock_pg.bundle_specs = [{"TPU": 1}] * 4
+        mock_workers = [MagicMock() for _ in range(4)]
+        mock_ray.remote.return_value.return_value.remote.side_effect = mock_workers
+        # Simulate IPs for workers created with ranks 0, 1, 2, 3
+        worker_ips = ["10.0.0.2", "10.0.0.3", "10.0.0.1", "10.0.0.4"]
+        mock_ray.get.side_effect = [
+            worker_ips,  # worker_ips
+            *[('node_1', ['0', '1', '2', '3']),
+              ('node_2', ['4', '5', '6', '7']),
+              ('node_3', ['8', '9', '10', '11']),
+              ('node_4', ['12', '13', '14', '15'])]  # worker_node_and_tpu_ids
+        ]
+        executor = self.RayDistributedExecutor(self.vllm_config)
+        executor.use_ray_spmd_worker = True
+        executor.parallel_config = self.vllm_config.parallel_config
+        executor.vllm_config = self.vllm_config
+        executor.parallel_config.ray_workers_use_nsight = False
+        executor.collective_rpc = MagicMock()
+        executor.collective_rpc.return_value = None
+        # --- Call method under test ---
+        executor._init_workers_ray(mock_pg)
+        # --- Assertions ---
+        # Expected sorted order of workers: driver, then by IP
+        # Original workers: 0 (10.0.0.2), 1 (10.0.0.3), 2 (10.0.0.1), 3 (10.0.0.4)
+        # Sorted workers: 2 (driver), 0, 1, 3
+        self.assertEqual(executor.workers, [
+            mock_workers[2], mock_workers[0], mock_workers[1], mock_workers[3]
+        ])

tests/experimental/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tests/experimental/test_llama3_jax_stashed.py ADDED Viewed

@@ -0,0 +1,208 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+import jax
+import jax.numpy as jnp
+import numpy as np
+import pytest
+from flax import nnx
+from flax.typing import PRNGKey
+from jax.sharding import Mesh
+from tpu_inference.experimental.llama3_jax_stashed import (Llama3WeightLoader,
+                                                           LlamaForCausalLM)
+class MockParam:
+    """A mock for a parameter used in the Llama model."""
+    def __init__(self, shape=(32, 128)):
+        self.value = SimpleNamespace(shape=shape)
+        # The sharding spec is accessed during weight loading
+        self.sharding = SimpleNamespace(spec=None)
+    # Allow the mock parameter's value to be updated
+    def __setattr__(self, name, value):
+        if name == "value":
+            self.__dict__[name] = value
+        else:
+            super().__setattr__(name, value)
+class MockVllmConfig:
+    """A mock VllmConfig sufficient for testing the Llama3 model."""
+    def __init__(self,
+                 model_name: str,
+                 random_weights: bool = False,
+                 tensor_parallelism: int = 1):
+        self.model_config = SimpleNamespace(model=model_name,
+                                            dtype="bfloat16",
+                                            hf_overrides={},
+                                            override_generation_config={})
+        self.load_config = MagicMock()
+        self.additional_config = {
+            "random_weights": random_weights,
+            "sharding": {
+                "sharding_strategy": {
+                    "tensor_parallelism": tensor_parallelism
+                }
+            }
+        }
+        # NOTE (jacobplatin): we could add a quantized KV cache test, but
+        # we'll skip it for now.
+        self.cache_config = MagicMock(cache_dtype="auto")
+@pytest.fixture(scope="module")
+def mesh():
+    """
+    Creates a mesh with all required axes for testing.
+    The sharding logic expects 'data', 'model', and 'expert' axes, so this
+    builds a 3D mesh naming all three, even on a single device.
+    """
+    if not jax.devices():
+        pytest.skip("No JAX devices available for mesh creation.")
+    devices = np.array(jax.local_devices())
+    # Reshape devices into a 3D array to name 3 axes: data, model, and expert.
+    # The 'model' and 'expert' axes will have a size of 1.
+    num_devices = len(devices)
+    device_mesh = devices.reshape((num_devices, 1, 1))
+    with Mesh(device_mesh, axis_names=('data', 'model', 'expert')) as m:
+        yield m
+@pytest.fixture
+def rng() -> PRNGKey:
+    """Provides a reusable JAX PRNGKey."""
+    return jax.random.PRNGKey(42)
+@pytest.fixture
+def mock_vllm_config_8b() -> MockVllmConfig:
+    return MockVllmConfig(model_name="meta-llama/Llama-3-8B")
+@pytest.fixture
+def mock_vllm_config_70b() -> MockVllmConfig:
+    return MockVllmConfig(model_name="meta-llama/Llama-3-70B-Instruct")
+@pytest.fixture
+def mock_vllm_config_unknown() -> MockVllmConfig:
+    return MockVllmConfig(model_name="some-other-model")
+# --- Test Cases ---
+class TestLlamaForCausalLM:
+    """Tests for the main LlamaForCausalLM model class."""
+    def test_init_8b_variant(self, mock_vllm_config_8b, rng, mesh):
+        """Tests correct parameter detection for the 8B model variant."""
+        model = LlamaForCausalLM(mock_vllm_config_8b, rng, mesh)
+        assert model.hidden_size == 4096
+        assert "8b" in model.vllm_config.model_config.model.lower()
+    def test_init_70b_variant(self, mock_vllm_config_70b, rng, mesh):
+        """Tests correct parameter detection for the 70B model variant."""
+        model = nnx.eval_shape(
+            lambda: LlamaForCausalLM(mock_vllm_config_70b, rng, mesh))
+        assert model.hidden_size == 8192
+        assert "70b" in model.vllm_config.model_config.model.lower()
+    def test_init_unknown_variant_raises_error(self, mock_vllm_config_unknown,
+                                               rng, mesh):
+        """Tests that an unknown model variant raises a ValueError."""
+        with pytest.raises(ValueError,
+                           match="Could not determine Llama3 variant"):
+            LlamaForCausalLM(mock_vllm_config_unknown, rng, mesh)
+    def test_create_model_with_random_weights(self, mock_vllm_config_8b, rng,
+                                              mesh):
+        """
+        Tests that random weight initialization creates concrete arrays: the
+        embedding and attention kernels have non-zero variance, and the final
+        norm scale is all ones.
+        """
+        with jax.set_mesh(mesh):
+            model = LlamaForCausalLM(vllm_config=mock_vllm_config_8b,
+                                     rng=rng,
+                                     mesh=mesh,
+                                     force_random_weights=True)
+            embedding_weight = model.embedder.input_embedding_table_VD.value
+            attention_q_kernel = model.layers[0].attn.kernel_q_proj_DNH.value
+            final_norm_scale = model.final_norm.scale.value
+            assert isinstance(embedding_weight, jax.Array)
+            assert isinstance(attention_q_kernel, jax.Array)
+            assert isinstance(final_norm_scale, jax.Array)
+            assert jnp.std(embedding_weight) > 0
+            assert jnp.std(attention_q_kernel) > 0
+            assert jnp.all(final_norm_scale == 1.0)
+    @patch("tpu_inference.experimental.llama3_jax_stashed.Llama3WeightLoader")
+    def test_load_weights_called_correctly(self, mock_loader_cls, rng, mesh):
+        """Tests that the weight loader is called correctly for checkpoint loading."""
+        vllm_config = MockVllmConfig(model_name="llama3-8b",
+                                     random_weights=False)
+        model = LlamaForCausalLM(vllm_config, rng, mesh)
+        mock_loader_instance = MagicMock()
+        mock_loader_cls.return_value = mock_loader_instance
+        model.load_weights(rng, cache_dir="/tmp/cache")
+        mock_loader_cls.assert_called_once_with(vllm_config=vllm_config,
+                                                hidden_size=4096,
+                                                attn_heads=32,
+                                                num_key_value_heads=8,
+                                                attn_head_dim=128)
+        mock_loader_instance.load_weights.assert_called_once_with(model)
+class TestLlama3WeightLoader:
+    """Tests for the Llama3WeightLoader class."""
+    @pytest.fixture
+    def weight_loader(self):
+        # Build the loader directly with small dimensions; nothing is patched here.
+        return Llama3WeightLoader(vllm_config=MockVllmConfig("test-model"),
+                                  hidden_size=32,
+                                  attn_heads=4,
+                                  num_key_value_heads=2,
+                                  attn_head_dim=8)
+    def test_load_weights_transformation(self, weight_loader, rng, mesh):
+        """Tests that weights are correctly reshaped, transposed, and loaded."""
+        vllm_config = MockVllmConfig("llama3-8b-small-test",
+                                     random_weights=False)
+        # Create a model instance from the test config.
+        model = LlamaForCausalLM(vllm_config, rng, mesh)
+        with patch(
+                "tpu_inference.experimental.llama3_jax_stashed.load_hf_weights"
+        ) as mock_load:
+            # load_hf_weights is patched above, so this call hits the mock.
+            weight_loader.load_weights(model)
+            # Assert that the loader invoked load_hf_weights exactly once
+            mock_load.assert_called_once()

tests/kernels/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tests/kernels/collectives/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tests/kernels/collectives/all_gather_matmul_kernel_test.py ADDED Viewed

@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+import jax
+import jax.numpy as jnp
+from absl.testing import absltest, parameterized
+from jax._src import test_util as jtu
+from tpu_inference import utils
+from tpu_inference.kernels.collectives import all_gather_matmul
+# Hand JAX configuration flags to absl's flag parser.
+jax.config.parse_flags_with_absl()
+# Short alias used for the sharding specs built in the test below.
+P = jax.sharding.PartitionSpec
+# Value of the TEST_UNDECLARED_OUTPUTS_DIR environment variable, or None when
+# it is unset.
+# NOTE(review): not referenced anywhere else in the visible part of this file.
+SpongeDir: str | None = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', None)
+@jtu.with_config(jax_numpy_dtype_promotion='standard')
+class AllGatherMatmulTest(jtu.JaxTestCase):
+    @parameterized.product(
+        grid_k=[1, 2, 3],
+        grid_n=[1, 2, 3],
+        rhs_transpose=[True, False],
+    )
+    def test_all_gather_matmul(self, grid_k, grid_n, rhs_transpose):
+        if jax.device_count() != 8:
+            self.skipTest('Not enough devices for test')
+        axis_name = 'x'
+        num_devices = jax.device_count()
+        mesh = utils.make_optimized_mesh((num_devices, ), (axis_name, ))
+        bk, bn = 1024, 1024
+        m, k, n = 1024, bk * grid_k, bn * grid_n * num_devices
+        # Run the test 10 times to expose race conditions as much as possible.
+        for i in range(10):
+            # Create input data
+            prng_key = jax.random.key(1234 + i)
+            k0, k1 = jax.random.split(prng_key, 2)
+            x = jax.random.normal(k0, (m, k), dtype=jnp.bfloat16)
+            y_shape = (n, k) if rhs_transpose else (k, n)
+            y_sharding = P(axis_name, None) if rhs_transpose else P(
+                None, axis_name)
+            y = jax.random.normal(k1, y_shape, dtype=jnp.bfloat16)
+            sharded_x = jax.device_put(
+                x, jax.sharding.NamedSharding(mesh, P(axis_name, None)))
+            sharded_y = jax.device_put(
+                y, jax.sharding.NamedSharding(mesh, y_sharding))
+            # Run the all_gather_matmul function
+            output = all_gather_matmul.all_gather_matmul(
+                sharded_x,
+                sharded_y,
+                mesh,
+                axis_name,
+                bk=bk,
+                bn=bn,
+                rhs_transpose=rhs_transpose,
+            )
+            y_for_dot = sharded_y.T if rhs_transpose else sharded_y
+            expected_output = jnp.dot(sharded_x, y_for_dot)
+            self.assertAllClose(output, expected_output, atol=1e-2, rtol=1e-2)
+# When executed as a script, run the tests through absltest with JAX's loader.
+if __name__ == "__main__":
+    absltest.main(testLoader=jtu.JaxTestLoader())

tests/kernels/fused_moe_v1_test.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import jax
 import jax.numpy as jnp
 import numpy as np

tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl

Potentially problematic release.

tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl