tpu-inference 0.11.1.dev202511130813__py3-none-any.whl → 0.11.1.dev202511220812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of tpu-inference might be problematic.
- tests/lora/test_layers.py +0 -6
- tests/lora/utils.py +0 -8
- tests/test_envs.py +182 -0
- tests/test_utils.py +23 -14
- tpu_inference/__init__.py +22 -3
- tpu_inference/core/core_tpu.py +17 -9
- tpu_inference/core/disagg_utils.py +6 -8
- tpu_inference/distributed/tpu_connector.py +2 -3
- tpu_inference/distributed/utils.py +3 -2
- tpu_inference/envs.py +1 -1
- tpu_inference/executors/ray_distributed_executor.py +27 -11
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +110 -64
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +7 -0
- tpu_inference/layers/{jax → common}/attention_interface.py +1 -1
- tpu_inference/layers/common/quant_methods.py +8 -0
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/quantization/__init__.py +7 -3
- tpu_inference/layers/vllm/quantization/awq.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -2
- tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
- tpu_inference/layers/vllm/quantization/unquantized.py +4 -3
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +1 -2
- tpu_inference/models/common/model_loader.py +12 -11
- tpu_inference/models/jax/llama3.py +4 -3
- tpu_inference/models/jax/llama_eagle3.py +9 -5
- tpu_inference/models/jax/llama_guard_4.py +361 -0
- tpu_inference/models/jax/qwen2.py +3 -2
- tpu_inference/models/jax/qwen2_5_vl.py +4 -3
- tpu_inference/models/jax/qwen3.py +3 -2
- tpu_inference/models/jax/utils/weight_utils.py +21 -8
- tpu_inference/models/vllm/vllm_model_wrapper.py +22 -10
- tpu_inference/platforms/tpu_platform.py +17 -7
- tpu_inference/runner/compilation_manager.py +37 -17
- tpu_inference/runner/kv_cache.py +1 -1
- tpu_inference/runner/kv_cache_manager.py +8 -2
- tpu_inference/runner/tpu_runner.py +199 -87
- tpu_inference/spec_decode/jax/eagle3.py +2 -1
- tpu_inference/tpu_info.py +4 -3
- tpu_inference/utils.py +7 -6
- tpu_inference/worker/tpu_worker.py +159 -23
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/METADATA +2 -2
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/RECORD +52 -54
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +0 -28
- tpu_inference/mock/vllm_envs.py +0 -1219
- tpu_inference/mock/vllm_logger.py +0 -212
- tpu_inference/mock/vllm_logging_utils.py +0 -15
- tpu_inference/models/jax/phi3.py +0 -376
- /tpu_inference/layers/{jax → common}/binary_search.py +0 -0
- /tpu_inference/layers/{jax → common}/sharding.py +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/top_level.txt +0 -0
tests/lora/test_layers.py
CHANGED

@@ -91,7 +91,6 @@ def populate_loras(
     index_to_id: list[Optional[int]],
     lora_layer: BaseLayerWithLoRA,
     baselayer_weights: torch.Tensor,
-    generate_embeddings_tensor: int = 0,
     repeats: int = 1,
 ) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]:
     """This method populates the lora weights (lora_a and lora_b) in the lora layers (BaseLayerWithLoRA).
@@ -103,8 +102,6 @@ def populate_loras(
         lora_layer: the LoRAlayer to populate.
         baselayer_weights: the PyTorch tensor containing the layer's
             weights.
-        generate_embeddings_tensor: whether to generate an
-            embeddings tensor for each LoRA.
         repeats: must only be set for column parallel packed
             layers. Indicates the number of loras to compose
             together to create a single lora layer.
@@ -131,7 +128,6 @@ def populate_loras(
                 baselayer_weights.device).init_random_lora(
                     module_name=f"fake_{i}",
                     weight=baselayer_weights,
-                    generate_embeddings_tensor=generate_embeddings_tensor,
                 )
             sublora.lora_b = sublora.lora_b[(sublora_len *
                                              i):(sublora_len * (i + 1)), :]
@@ -147,7 +143,6 @@ def populate_loras(
             slot_idx,
             lora_a=lora.lora_a,
             lora_b=lora.lora_b,
-            embeddings_tensor=lora.embeddings_tensor,
         )

         lora_dict[lora_id] = lora
@@ -546,7 +541,6 @@ def _update_punica_wrapper_metadata(punica_wrapper, index_mapping,
         index_to_id,
         lora_config.max_loras,
         vocab_size=512,
-        extra_vocab_size=lora_config.lora_extra_vocab_size,
     )
     assert jax_view(punica_wrapper._lora_indices_per_batch).platform(
     ) == 'tpu', 'punica_wrapper._lora_indices_per_batch should have been moved to TPU.'
tests/lora/utils.py
CHANGED

@@ -24,7 +24,6 @@ class DummyLoRAManager:
         module_name: str,
         weight: torch.Tensor,
         rank: int = 8,
-        generate_embeddings_tensor: int = 0,
     ):
         lora = LoRALayerWeights(
             module_name,
@@ -37,13 +36,6 @@ class DummyLoRAManager:
                 dtype=weight.dtype,
                 device=self._device),
         )
-        if generate_embeddings_tensor:
-            lora.embeddings_tensor = torch.rand(
-                5,
-                generate_embeddings_tensor,
-                dtype=weight.dtype,
-                device=self._device,
-            )
         self.set_module_lora(module_name, lora)

         return lora
tests/test_envs.py
ADDED

@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the tpu-inference project
+
+import pytest
+
+import tpu_inference.envs as envs
+from tpu_inference.envs import enable_envs_cache, environment_variables
+
+
+def test_getattr_without_cache(monkeypatch: pytest.MonkeyPatch):
+    assert envs.JAX_PLATFORMS == ""
+    assert envs.PHASED_PROFILING_DIR == ""
+    monkeypatch.setenv("JAX_PLATFORMS", "tpu")
+    monkeypatch.setenv("PHASED_PROFILING_DIR", "/tmp/profiling")
+    assert envs.JAX_PLATFORMS == "tpu"
+    assert envs.PHASED_PROFILING_DIR == "/tmp/profiling"
+
+    assert envs.TPU_NAME is None
+    assert envs.TPU_ACCELERATOR_TYPE is None
+    monkeypatch.setenv("TPU_NAME", "my-tpu")
+    monkeypatch.setenv("TPU_ACCELERATOR_TYPE", "v5litepod-16")
+    assert envs.TPU_NAME == "my-tpu"
+    assert envs.TPU_ACCELERATOR_TYPE == "v5litepod-16"
+
+    # __getattr__ is not decorated with functools.cache
+    assert not hasattr(envs.__getattr__, "cache_info")
+
+
+def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("JAX_PLATFORMS", "tpu")
+    monkeypatch.setenv("TPU_NAME", "my-tpu")
+
+    # __getattr__ is not decorated with functools.cache
+    assert not hasattr(envs.__getattr__, "cache_info")
+
+    enable_envs_cache()
+
+    # __getattr__ is decorated with functools.cache
+    assert hasattr(envs.__getattr__, "cache_info")
+    start_hits = envs.__getattr__.cache_info().hits
+
+    # 2 more hits due to JAX_PLATFORMS and TPU_NAME accesses
+    assert envs.JAX_PLATFORMS == "tpu"
+    assert envs.TPU_NAME == "my-tpu"
+    assert envs.__getattr__.cache_info().hits == start_hits + 2
+
+    # All environment variables are cached
+    for environment_variable in environment_variables:
+        envs.__getattr__(environment_variable)
+    assert envs.__getattr__.cache_info(
+    ).hits == start_hits + 2 + len(environment_variables)
+
+    # Reset envs.__getattr__ back to non-cached version to
+    # avoid affecting other tests
+    envs.__getattr__ = envs.__getattr__.__wrapped__
+
+
+def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
+    # Test SKIP_JAX_PRECOMPILE (default False)
+    assert envs.SKIP_JAX_PRECOMPILE is False
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "1")
+    assert envs.SKIP_JAX_PRECOMPILE is True
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "0")
+    assert envs.SKIP_JAX_PRECOMPILE is False
+
+    # Test NEW_MODEL_DESIGN (default False)
+    assert envs.NEW_MODEL_DESIGN is False
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "1")
+    assert envs.NEW_MODEL_DESIGN is True
+
+    # Test USE_MOE_EP_KERNEL (default False)
+    assert envs.USE_MOE_EP_KERNEL is False
+    monkeypatch.setenv("USE_MOE_EP_KERNEL", "1")
+    assert envs.USE_MOE_EP_KERNEL is True
+
+
+def test_integer_env_vars(monkeypatch: pytest.MonkeyPatch):
+    assert envs.PYTHON_TRACER_LEVEL == 1
+    monkeypatch.setenv("PYTHON_TRACER_LEVEL", "3")
+    assert envs.PYTHON_TRACER_LEVEL == 3
+    monkeypatch.setenv("PYTHON_TRACER_LEVEL", "0")
+    assert envs.PYTHON_TRACER_LEVEL == 0
+
+
+def test_lowercase_conversion(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "GRPC")
+    assert envs.TPU_MULTIHOST_BACKEND == "grpc"
+
+    monkeypatch.setenv("MODEL_IMPL_TYPE", "FLAX_NNX")
+    assert envs.MODEL_IMPL_TYPE == "flax_nnx"
+
+
+def test_string_env_vars_defaults(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.delenv("JAX_PLATFORMS", raising=False)
+    monkeypatch.delenv("PREFILL_SLICES", raising=False)
+    monkeypatch.delenv("DECODE_SLICES", raising=False)
+
+    assert envs.JAX_PLATFORMS == ""
+    assert envs.PREFILL_SLICES == ""
+    assert envs.DECODE_SLICES == ""
+    assert envs.PHASED_PROFILING_DIR == ""
+
+
+def test_none_default_env_vars(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.delenv("TPU_ACCELERATOR_TYPE", raising=False)
+    monkeypatch.delenv("TPU_NAME", raising=False)
+    monkeypatch.delenv("TPU_WORKER_ID", raising=False)
+
+    assert envs.TPU_ACCELERATOR_TYPE is None
+    assert envs.TPU_NAME is None
+    assert envs.TPU_WORKER_ID is None
+
+
+def test_ray_env_vars(monkeypatch: pytest.MonkeyPatch):
+    assert envs.RAY_USAGE_STATS_ENABLED == "0"
+    monkeypatch.setenv("RAY_USAGE_STATS_ENABLED", "1")
+    assert envs.RAY_USAGE_STATS_ENABLED == "1"
+
+    assert envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "shm"
+    monkeypatch.setenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "nccl")
+    assert envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "nccl"
+
+
+def test_invalid_attribute_raises_error():
+    with pytest.raises(AttributeError,
+                       match="has no attribute 'NONEXISTENT_VAR'"):
+        _ = envs.NONEXISTENT_VAR
+
+
+def test_dir_returns_all_env_vars():
+    env_vars = envs.__dir__()
+    assert isinstance(env_vars, list)
+    assert len(env_vars) == len(environment_variables)
+    assert "JAX_PLATFORMS" in env_vars
+    assert "TPU_NAME" in env_vars
+    assert "SKIP_JAX_PRECOMPILE" in env_vars
+    assert "MODEL_IMPL_TYPE" in env_vars
+
+
+def test_tpu_multihost_env_vars(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("TPU_WORKER_ID", "0")
+    assert envs.TPU_WORKER_ID == "0"
+
+    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "grpc")
+    assert envs.TPU_MULTIHOST_BACKEND == "grpc"
+
+    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "xla")
+    assert envs.TPU_MULTIHOST_BACKEND == "xla"
+
+
+def test_disaggregated_serving_env_vars(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("PREFILL_SLICES", "0,1,2,3")
+    assert envs.PREFILL_SLICES == "0,1,2,3"
+
+    monkeypatch.setenv("DECODE_SLICES", "4,5,6,7")
+    assert envs.DECODE_SLICES == "4,5,6,7"
+
+
+def test_model_impl_type_default(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.delenv("MODEL_IMPL_TYPE", raising=False)
+    assert envs.MODEL_IMPL_TYPE == "flax_nnx"
+
+
+def test_cache_preserves_values_across_env_changes(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("JAX_PLATFORMS", "tpu")
+
+    enable_envs_cache()
+
+    assert envs.JAX_PLATFORMS == "tpu"
+
+    # Change environment variable
+    monkeypatch.setenv("JAX_PLATFORMS", "cpu")
+
+    # Cached value should still be "tpu"
+    assert envs.JAX_PLATFORMS == "tpu"
+
+    # Reset envs.__getattr__ back to non-cached version
+    envs.__getattr__ = envs.__getattr__.__wrapped__
+
+    # Now it should reflect the new value
+    assert envs.JAX_PLATFORMS == "cpu"
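The new tests above exercise the lazy environment-variable access and opt-in caching in tpu_inference/envs.py. The following is a minimal sketch of that pattern, assuming a module-level __getattr__ (PEP 562) wrapped by functools.cache when enable_envs_cache() is called; the variable set and the parsing shown here are illustrative, not the package's exact definitions.

# Illustrative sketch only; mirrors the pattern tests/test_envs.py exercises.
import functools
import os
from typing import Any, Callable

# Each entry maps an env-var name to a lazy reader (values here are examples).
environment_variables: dict[str, Callable[[], Any]] = {
    "JAX_PLATFORMS": lambda: os.getenv("JAX_PLATFORMS", "").lower(),
    "TPU_NAME": lambda: os.getenv("TPU_NAME", None),
    "SKIP_JAX_PRECOMPILE": lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE", "0"))),
}


def __getattr__(name: str) -> Any:
    # Module-level __getattr__ (PEP 562): envs.FOO re-reads os.environ on
    # every access until caching is enabled.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__() -> list[str]:
    return list(environment_variables.keys())


def enable_envs_cache() -> None:
    # Wrap __getattr__ in functools.cache so later reads return the first
    # value seen; the tests undo this via envs.__getattr__.__wrapped__.
    global __getattr__
    __getattr__ = functools.cache(__getattr__)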
tests/test_utils.py
CHANGED

@@ -75,25 +75,34 @@ def test_hbm_usage_bytes_pathways_enabled(mock_devices, mock_live_arrays):
     mock_device2 = MagicMock()
     devices = [mock_device1, mock_device2]

-    # Create mock
+    # Create mock addressable shards with data property
+    mock_data1_dev1 = MagicMock()
+    mock_data1_dev1.device = mock_device1
+    mock_data1_dev1.nbytes = 2000  # 2000 bytes on device1

+    mock_data1_dev2 = MagicMock()
+    mock_data1_dev2.device = mock_device2
+    mock_data1_dev2.nbytes = 2000  # 2000 bytes on device2

+    mock_data2_dev1 = MagicMock()
+    mock_data2_dev1.device = mock_device1
+    mock_data2_dev1.nbytes = 1000  # 1000 bytes on device1

+    mock_shard1_dev1 = MagicMock()
+    mock_shard1_dev1.data = mock_data1_dev1
+
+    mock_shard1_dev2 = MagicMock()
+    mock_shard1_dev2.data = mock_data1_dev2
+
+    mock_shard2_dev1 = MagicMock()
+    mock_shard2_dev1.data = mock_data2_dev1
+
+    # Create mock arrays with addressable_shards
     mock_array1 = MagicMock()
-    mock_array1.
+    mock_array1.addressable_shards = [mock_shard1_dev1, mock_shard1_dev2]

     mock_array2 = MagicMock()
-    mock_array2.
+    mock_array2.addressable_shards = [mock_shard2_dev1]

     mock_live_arrays.return_value = [mock_array1, mock_array2]

@@ -159,7 +168,7 @@ def test_hbm_usage_bytes_pathways_no_arrays(mock_devices, mock_live_arrays):
     "head_dim, expected_padded_head_dim",
     [
         (1, 128),
-        (64,
+        (64, 64),
         (127, 128),
         (128, 128),
         (129, 256),
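The reworked mocks above (addressable_shards, shard.data.device, shard.data.nbytes) suggest the utility under test walks live JAX arrays and sums shard bytes per device. A minimal sketch of that idea follows, assuming jax.live_arrays(); the helper name and return shape are placeholders, not the actual signature in tpu_inference/utils.py.

# Sketch of the per-device HBM accounting the mocks above imply; not the
# package's actual implementation.
from collections import defaultdict

import jax


def hbm_usage_bytes_per_device() -> dict:
    usage: dict = defaultdict(int)
    for array in jax.live_arrays():
        for shard in array.addressable_shards:
            # Each addressable shard exposes its buffer via .data, which
            # carries the owning device and its size in bytes.
            usage[shard.data.device] += shard.data.nbytes
    return dict(usage)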
tpu_inference/__init__.py
CHANGED

@@ -1,21 +1,40 @@
-import os
-
 # The environment variables override should be imported before any other
 # modules to ensure that the environment variables are set before any
 # other modules are imported.
 import tpu_inference.env_override  # noqa: F401
+from tpu_inference import envs
 from tpu_inference import tpu_info as ti
 from tpu_inference.logger import init_logger

 logger = init_logger(__name__)

-if "proxy" in
+if "proxy" in envs.JAX_PLATFORMS:
     logger.info("Running vLLM on TPU via Pathways proxy.")
     # Must run pathwaysutils.initialize() before any JAX operations
     try:
+        import traceback
+
         import pathwaysutils
+        import vllm
+        from vllm.platforms import (resolve_current_platform_cls_qualname,
+                                    resolve_obj_by_qualname)
         pathwaysutils.initialize()
         logger.info("Module pathwaysutils is imported.")
+
+        # Pathways requires eager resolution of vllm.current_platform instead of
+        # lazy resolution in the normal code path. Since this part involves
+        # global topology discovery across multiple hosts, the platform
+        # resolution must happen before other components are loaded.
+        logger.info("Eagerly resolving vLLM current_platform for Pathways.")
+        platform_cls_qualname = resolve_current_platform_cls_qualname()
+        resolved_platform_instance = resolve_obj_by_qualname(
+            platform_cls_qualname)()
+        vllm.platforms._current_platform = resolved_platform_instance
+        vllm.platforms._init_trace = "".join(traceback.format_stack())
+        logger.info(
+            f"vLLM platform resolved to: {resolved_platform_instance.__class__.__name__}"
+        )
+
     except Exception as e:
         logger.error(
             f"Error occurred while importing pathwaysutils or logging TPU info: {e}"
tpu_inference/core/core_tpu.py
CHANGED

@@ -29,6 +29,7 @@ from vllm.v1.request import Request, RequestStatus

 from tpu_inference import utils as common_utils
 from tpu_inference.core import disagg_executor, disagg_utils
+from tpu_inference.runner.tpu_runner import AsyncTPUModelRunnerOutput
 # ======================================================================================
 # Imports for _DisaggOrchestrator (decoupled from vLLM)
 # ======================================================================================
@@ -186,6 +187,8 @@ class _DisaggOrchestrator:
             if model_output is None:
                 model_output = prefill_engine.model_executor.sample_tokens(
                     grammar_output)
+            if isinstance(model_output, AsyncTPUModelRunnerOutput):
+                model_output = model_output.get_output()

             if scheduler_output.total_num_scheduled_tokens > 0:
                 logger.debug(f"Prefill result: {model_output}")
@@ -218,15 +221,16 @@ class _DisaggOrchestrator:
                     f"request-{req_id}: tokens={request.all_token_ids} after prefill"
                 )
                 # Remove request from the prefill engine.
-                prefill_engine.scheduler.encoder_cache_manager.free(
-                    request)
-                prefill_engine.scheduler.requests.pop(req_id)
+                if req_id in prefill_engine.scheduler.requests:
+                    request = prefill_engine.scheduler.requests[req_id]
+                    prefill_engine.scheduler.running.remove(request)
+                    prefill_engine.scheduler.encoder_cache_manager.free(
+                        request)
+                    prefill_engine.scheduler.kv_cache_manager.free(
+                        request)
+                    prefill_engine.scheduler.requests.pop(req_id)

             for output in (engine_core_outputs.items()
                            if engine_core_outputs else ()):
@@ -335,8 +339,10 @@ class _DisaggOrchestrator:
                 new_block_ids = kv_cache_manager.get_block_ids(req_id)
                 logger.debug(
                     f"inserting {req_id} new_block_ids {new_block_ids}")
+                if len(new_block_ids[0]) != math.ceil(
+                        prompt_tokens / self._config.cache_config.block_size):
+                    logger.warning("Running out of blocks in decode engine! ")
+                    break

                 decode_engine.model_executor.driver_worker.model_runner.insert_request_with_kv_cache(
                     vllm_request, kv_cache, new_block_ids)
@@ -366,6 +372,8 @@ class _DisaggOrchestrator:
             if model_output is None:
                 model_output = decode_engine.model_executor.sample_tokens(
                     grammar_output)
+            if isinstance(model_output, AsyncTPUModelRunnerOutput):
+                model_output = model_output.get_output()

             if scheduler_output.total_num_scheduled_tokens > 0:
                 logger.debug(f"Decode result: {model_output}")

tpu_inference/core/disagg_utils.py
CHANGED

@@ -1,17 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0

-import os
 from typing import Tuple

-DECODE_SLICES = 'DECODE_SLICES'
+from tpu_inference import envs


 def is_disagg_enabled() -> bool:
     # We triggrer our code path as long as prefill slices are set. This
     # allows us to test interleave mode effectively with the code path
     # for comparison purposes.
-    return PREFILL_SLICES
+    return bool(envs.PREFILL_SLICES)


 def _parse_slices(slices_str: str) -> Tuple[int, ...]:
@@ -40,12 +38,12 @@ def _parse_slices(slices_str: str) -> Tuple[int, ...]:


 def get_prefill_slices() -> Tuple[int, ...]:
-    if
+    if not envs.PREFILL_SLICES:
         return ()
-    return _parse_slices(
+    return _parse_slices(envs.PREFILL_SLICES)


 def get_decode_slices() -> Tuple[int, ...]:
-    if
+    if not envs.DECODE_SLICES:
         return ()
-    return _parse_slices(
+    return _parse_slices(envs.DECODE_SLICES)

tpu_inference/distributed/tpu_connector.py
CHANGED

@@ -60,7 +60,6 @@ D workflow:

 import copy
 import functools
-import os
 import threading
 import time
 from concurrent.futures import Future, ThreadPoolExecutor
@@ -86,6 +85,7 @@ if TYPE_CHECKING:
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request

+from tpu_inference import envs
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_ips,
                                              get_kv_ports,
                                              get_kv_transfer_port, get_node_id,
@@ -441,8 +441,7 @@ class TPUConnectorWorker:

         self.runner: TPUModelRunner = None
         self.mesh: Mesh = None
-        self.multi_host =
-            "").lower() == "ray"
+        self.multi_host = envs.TPU_MULTIHOST_BACKEND == "ray"
         # NOTE(xiang): This can not be the worker rank set in RayDistributedExecutor.
         # The worker rank is assigned with vLLM's sorting logic, which does not work
         # for TPU host topology.

tpu_inference/distributed/utils.py
CHANGED

@@ -2,6 +2,7 @@ import os

 from vllm.utils.network_utils import get_ip

+from tpu_inference import envs
 from tpu_inference.logger import init_logger

 logger = init_logger(__name__)
@@ -17,7 +18,7 @@ def set_node_kv_ip_port(ip_port: tuple[int, str, int]):


 def get_kv_ips() -> str:
-    if
+    if envs.TPU_MULTIHOST_BACKEND == "ray":
         num_nodes = len(_NODES_KV_IP_PORT)
         ips = []
         for node_id in range(num_nodes):
@@ -28,7 +29,7 @@ def get_kv_ips() -> str:


 def get_kv_ports() -> str:
-    if
+    if envs.TPU_MULTIHOST_BACKEND == "ray":
         num_nodes = len(_NODES_KV_IP_PORT)
         ports = []
         for node_id in range(num_nodes):
tpu_inference/envs.py
CHANGED

@@ -26,7 +26,7 @@ if TYPE_CHECKING:
 environment_variables: dict[str, Callable[[], Any]] = {
     # JAX platform selection (e.g., "tpu", "cpu", "proxy")
     "JAX_PLATFORMS":
-    lambda: os.getenv("JAX_PLATFORMS", ""),
+    lambda: os.getenv("JAX_PLATFORMS", "").lower(),
     # TPU accelerator type (e.g., "v5litepod-16", "v4-8")
     "TPU_ACCELERATOR_TYPE":
     lambda: os.getenv("TPU_ACCELERATOR_TYPE", None),

tpu_inference/executors/ray_distributed_executor.py
CHANGED

@@ -108,6 +108,9 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
         ip_port = self.collective_rpc("get_node_kv_ip_port")
         for item in ip_port:
             set_node_kv_ip_port(item)
+        self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and (
+            self.vllm_config.ec_transfer_config is None
+            or not self.vllm_config.ec_transfer_config.is_ec_producer)

     def _initialize_ray_cluster(self) -> None:
         """Initialize the distributed cluster with Ray.
@@ -131,10 +134,17 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
                 f"current platform {current_platform.device_name} does not "
                 "support ray.")

+        pp_size = self.parallel_config.pipeline_parallel_size
+        placement_group_specs: List[Dict[str, float]] = []
+        if pp_size == 1:
+            placement_group_specs = [{
+                device_str: node['Resources'][device_str]
+            } for node in ray.nodes()]
+        else:
+            num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
+            placement_group_specs = [{
+                device_str: num_devices_per_pp_rank
+            } for _ in range(pp_size)]

         # vLLM engine is also a worker to execute model with an accelerator,
         # so it requires to have the device in a current node. Check if
@@ -329,6 +339,8 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
         all_kwargs = []
         for rank, (node_id, _) in enumerate(worker_node_and_tpu_ids):
             local_rank = node_workers[node_id].index(rank)
+            ip = sorted_worker_metadata[rank].ip
+            prev_ip = sorted_worker_metadata[rank - 1].ip if rank > 0 else ""
             kwargs = dict(
                 vllm_config=self.vllm_config,
                 local_rank=local_rank,
@@ -336,22 +348,26 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
                 distributed_init_method=distributed_init_method,
                 is_driver_worker=(not self.parallel_config)
                 or (rank % self.parallel_config.tensor_parallel_size == 0),
+                ip=ip,
+                prev_worker_ip=prev_ip,
             )
             all_kwargs.append(kwargs)
         self.collective_rpc("init_worker", args=(all_kwargs, ))
         self.collective_rpc("init_device")
+        if self.parallel_config.pipeline_parallel_size > 1:
+            self.collective_rpc("initialize_pp_transfer_connect")
         self.collective_rpc("load_model")

         if self.use_ray_spmd_worker:
             for pp_rank in range(self.parallel_config.pipeline_parallel_size):
                 self.pp_tp_workers.append([])
+                num_tp_workers = int(
+                    self.parallel_config.tensor_parallel_size //
+                    num_tpu_per_worker)
+                for tp_rank in range(num_tp_workers):
+                    # PP=2, TP=4, num_tpu_per_worker=2
+                    # pp_tp_workers = [[0, 1], [2, 3]]
+                    rank = (pp_rank * num_tp_workers) + tp_rank
                     assert len(self.pp_tp_workers[pp_rank]) == tp_rank
                     assert pp_rank < len(self.pp_tp_workers)
                     self.pp_tp_workers[pp_rank].append(self.workers[rank])