tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +34 -303
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
- tests/lora/test_layers.py +6 -0
- tests/lora/utils.py +8 -0
- tests/test_utils.py +16 -24
- tpu_inference/__init__.py +3 -22
- tpu_inference/core/core_tpu.py +9 -17
- tpu_inference/core/disagg_utils.py +8 -6
- tpu_inference/distributed/tpu_connector.py +4 -3
- tpu_inference/distributed/utils.py +2 -3
- tpu_inference/envs.py +8 -61
- tpu_inference/executors/ray_distributed_executor.py +11 -31
- tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/{common → jax}/sharding.py +5 -5
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/fused_moe.py +208 -170
- tpu_inference/layers/vllm/quantization/__init__.py +3 -7
- tpu_inference/layers/vllm/quantization/awq.py +3 -4
- tpu_inference/layers/vllm/quantization/common.py +1 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
- tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +2 -1
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/common/model_loader.py +12 -46
- tpu_inference/models/jax/llama3.py +3 -4
- tpu_inference/models/jax/llama_eagle3.py +5 -8
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +2 -3
- tpu_inference/models/jax/qwen2_5_vl.py +50 -165
- tpu_inference/models/jax/qwen3.py +2 -3
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
- tpu_inference/models/jax/utils/weight_utils.py +143 -198
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
- tpu_inference/platforms/tpu_platform.py +34 -47
- tpu_inference/runner/compilation_manager.py +60 -145
- tpu_inference/runner/kv_cache.py +2 -2
- tpu_inference/runner/kv_cache_manager.py +18 -17
- tpu_inference/runner/persistent_batch_manager.py +2 -40
- tpu_inference/runner/structured_decoding_manager.py +3 -2
- tpu_inference/runner/tpu_runner.py +135 -283
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +21 -71
- tpu_inference/tpu_info.py +3 -4
- tpu_inference/utils.py +15 -38
- tpu_inference/worker/tpu_worker.py +26 -163
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
- tests/test_envs.py +0 -203
- tpu_inference/layers/common/quant_methods.py +0 -8
- tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
- tpu_inference/models/jax/llama_guard_4.py +0 -361
- /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0
@@ -60,6 +60,7 @@ D workflow:
 
 import copy
 import functools
+import os
 import threading
 import time
 from concurrent.futures import Future, ThreadPoolExecutor
@@ -85,7 +86,6 @@ if TYPE_CHECKING:
     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
     from vllm.v1.request import Request
 
-from tpu_inference import envs
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_ips,
                                              get_kv_ports,
                                              get_kv_transfer_port, get_node_id,
@@ -441,7 +441,8 @@ class TPUConnectorWorker:
 
         self.runner: TPUModelRunner = None
         self.mesh: Mesh = None
-        self.multi_host =
+        self.multi_host = os.getenv("TPU_MULTIHOST_BACKEND",
+                                    "").lower() == "ray"
         # NOTE(xiang): This can not be the worker rank set in RayDistributedExecutor.
         # The worker rank is assigned with vLLM's sorting logic, which does not work
         # for TPU host topology.
@@ -457,6 +458,7 @@ class TPUConnectorWorker:
         self.side_channel_port = get_side_channel_port()
 
         self.kv_transfer_server = None
+        self._maybe_start_p2p_server()
         self.zmq_cxt = zmq.Context()
         if self.is_producer:
             ready_event = threading.Event()
@@ -498,7 +500,6 @@
         self.shape = list(kv_layer.shape)
         self.dtype = kv_layer.dtype
         self.sharding = kv_layer.sharding
-        self._maybe_start_p2p_server()
 
     def _maybe_start_p2p_server(self):
         if self.kv_transfer_server is not None:
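For context, the removed `from tpu_inference import envs` import and the rewritten `self.multi_host` assignment point at the same pattern: the backend check is now inlined against the raw environment. A minimal sketch of that check, using a hypothetical helper name and assuming only what the added lines show:

    import os

    def multihost_backend_is_ray() -> bool:
        # Hypothetical helper mirroring the inlined check added above:
        # TPU_MULTIHOST_BACKEND is read directly from the environment and
        # lower-cased before comparing against "ray".
        return os.getenv("TPU_MULTIHOST_BACKEND", "").lower() == "ray"

    # e.g. self.multi_host = multihost_backend_is_ray()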
tpu_inference/distributed/utils.py CHANGED

@@ -2,7 +2,6 @@ import os
 
 from vllm.utils.network_utils import get_ip
 
-from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 logger = init_logger(__name__)
@@ -18,7 +17,7 @@ def set_node_kv_ip_port(ip_port: tuple[int, str, int]):
 
 
 def get_kv_ips() -> str:
-    if
+    if os.getenv("TPU_MULTIHOST_BACKEND", "").lower() == "ray":
         num_nodes = len(_NODES_KV_IP_PORT)
         ips = []
         for node_id in range(num_nodes):
@@ -29,7 +28,7 @@ def get_kv_ips() -> str:
 
 
 def get_kv_ports() -> str:
-    if
+    if os.getenv("TPU_MULTIHOST_BACKEND", "").lower() == "ray":
         num_nodes = len(_NODES_KV_IP_PORT)
         ports = []
         for node_id in range(num_nodes):
tpu_inference/envs.py CHANGED

@@ -15,64 +15,18 @@ if TYPE_CHECKING:
     PREFILL_SLICES: str = ""
     DECODE_SLICES: str = ""
     SKIP_JAX_PRECOMPILE: bool = False
-    VLLM_XLA_CHECK_RECOMPILATION: bool = False
     MODEL_IMPL_TYPE: str = "flax_nnx"
     NEW_MODEL_DESIGN: bool = False
     PHASED_PROFILING_DIR: str = ""
     PYTHON_TRACER_LEVEL: int = 1
     USE_MOE_EP_KERNEL: bool = False
-    NUM_SLICES: int = 1
     RAY_USAGE_STATS_ENABLED: str = "0"
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "shm"
 
-
-def env_with_choices(
-    env_name: str,
-    default: str | None,
-    choices: list[str] | Callable[[], list[str]],
-    case_sensitive: bool = True,
-) -> Callable[[], str | None]:
-    """
-    Create a lambda that validates environment variable against allowed choices
-
-    Args:
-        env_name: Name of the environment variable
-        default: Default value if not set (can be None)
-        choices: List of valid string options or callable that returns list
-        case_sensitive: Whether validation should be case sensitive
-
-    Returns:
-        Lambda function for environment_variables dict
-    """
-
-    def _get_validated_env() -> str | None:
-        value = os.getenv(env_name)
-        if value is None:
-            return default
-
-        # Resolve choices if it's a callable (for lazy loading)
-        actual_choices = choices() if callable(choices) else choices
-
-        if not case_sensitive:
-            check_value = value.lower()
-            check_choices = [choice.lower() for choice in actual_choices]
-        else:
-            check_value = value
-            check_choices = actual_choices
-
-        if check_value not in check_choices:
-            raise ValueError(f"Invalid value '{value}' for {env_name}. "
-                             f"Valid options: {actual_choices}.")
-
-        return value
-
-    return _get_validated_env
-
-
 environment_variables: dict[str, Callable[[], Any]] = {
     # JAX platform selection (e.g., "tpu", "cpu", "proxy")
     "JAX_PLATFORMS":
-    lambda: os.getenv("JAX_PLATFORMS", "")
+    lambda: os.getenv("JAX_PLATFORMS", ""),
     # TPU accelerator type (e.g., "v5litepod-16", "v4-8")
     "TPU_ACCELERATOR_TYPE":
     lambda: os.getenv("TPU_ACCELERATOR_TYPE", None),
@@ -84,7 +38,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("TPU_WORKER_ID", None),
     # Backend for multi-host communication on TPU
     "TPU_MULTIHOST_BACKEND":
-
+    lambda: os.getenv("TPU_MULTIHOST_BACKEND", "").lower(),
     # Slice configuration for disaggregated prefill workers
     "PREFILL_SLICES":
     lambda: os.getenv("PREFILL_SLICES", ""),
@@ -93,35 +47,28 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("DECODE_SLICES", ""),
     # Skip JAX precompilation step during initialization
     "SKIP_JAX_PRECOMPILE":
-    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE"
-    # Check for XLA recompilation during execution
-    "VLLM_XLA_CHECK_RECOMPILATION":
-    lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION") or "0")),
+    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE", "0"))),
     # Model implementation type (e.g., "flax_nnx")
     "MODEL_IMPL_TYPE":
-
-    ["vllm", "flax_nnx", "jetpack"]),
+    lambda: os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower(),
     # Enable new experimental model design
     "NEW_MODEL_DESIGN":
-    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN"
+    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN", "0"))),
     # Directory to store phased profiling output
     "PHASED_PROFILING_DIR":
     lambda: os.getenv("PHASED_PROFILING_DIR", ""),
     # Python tracer level for profiling
     "PYTHON_TRACER_LEVEL":
-    lambda: int(os.getenv("PYTHON_TRACER_LEVEL"
+    lambda: int(os.getenv("PYTHON_TRACER_LEVEL", "1")),
     # Use custom expert-parallel kernel for MoE (Mixture of Experts)
     "USE_MOE_EP_KERNEL":
-    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL"
-    # Number of TPU slices for multi-slice mesh
-    "NUM_SLICES":
-    lambda: int(os.getenv("NUM_SLICES") or "1"),
+    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL", "0"))),
     # Enable/disable Ray usage statistics collection
     "RAY_USAGE_STATS_ENABLED":
     lambda: os.getenv("RAY_USAGE_STATS_ENABLED", "0"),
     # Ray compiled DAG channel type for TPU
     "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-
+    lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "shm"),
 }
 
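The hunks above drop the validated accessors (the deleted `env_with_choices`) in favor of plain `os.getenv` lambdas with inline defaults. In the usual envs-module pattern this file appears to follow (an assumption, since the access hook lies outside the shown hunks), those lambdas are evaluated lazily on attribute access via a module-level `__getattr__`:

    # Sketch of the assumed lazy-access pattern; everything except the
    # environment variable names themselves is illustrative.
    import os
    from typing import Any, Callable

    environment_variables: dict[str, Callable[[], Any]] = {
        "SKIP_JAX_PRECOMPILE": lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE", "0"))),
        "MODEL_IMPL_TYPE": lambda: os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower(),
    }

    def __getattr__(name: str) -> Any:
        # PEP 562 hook: called on module attribute access, so envs.MODEL_IMPL_TYPE
        # re-reads the current process environment each time.
        if name in environment_variables:
            return environment_variables[name]()
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")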
tpu_inference/executors/ray_distributed_executor.py CHANGED

@@ -108,9 +108,6 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
         ip_port = self.collective_rpc("get_node_kv_ip_port")
         for item in ip_port:
             set_node_kv_ip_port(item)
-        self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and (
-            self.vllm_config.ec_transfer_config is None
-            or not self.vllm_config.ec_transfer_config.is_ec_producer)
 
     def _initialize_ray_cluster(self) -> None:
         """Initialize the distributed cluster with Ray.
@@ -134,21 +131,10 @@
                 f"current platform {current_platform.device_name} does not "
                 "support ray.")
 
-
-
-
-
-        logger.info(f"RayDistributedExecutor | ray_nodes={ray_nodes}")
-
-        if pp_size == 1:
-            placement_group_specs = [{
-                device_str: node['Resources'][device_str]
-            } for node in ray_nodes]
-        else:
-            num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
-            placement_group_specs = [{
-                device_str: num_devices_per_pp_rank
-            } for _ in range(pp_size)]
+        placement_group_specs: List[Dict[str, float]] = [{
+            device_str:
+            node['Resources'][device_str]
+        } for node in ray.nodes()]
 
         # vLLM engine is also a worker to execute model with an accelerator,
         # so it requires to have the device in a current node. Check if
@@ -343,8 +329,6 @@
         all_kwargs = []
         for rank, (node_id, _) in enumerate(worker_node_and_tpu_ids):
             local_rank = node_workers[node_id].index(rank)
-            ip = sorted_worker_metadata[rank].ip
-            prev_ip = sorted_worker_metadata[rank - 1].ip if rank > 0 else ""
             kwargs = dict(
                 vllm_config=self.vllm_config,
                 local_rank=local_rank,
@@ -352,26 +336,22 @@
                 distributed_init_method=distributed_init_method,
                 is_driver_worker=(not self.parallel_config)
                 or (rank % self.parallel_config.tensor_parallel_size == 0),
-                ip=ip,
-                prev_worker_ip=prev_ip,
             )
             all_kwargs.append(kwargs)
         self.collective_rpc("init_worker", args=(all_kwargs, ))
         self.collective_rpc("init_device")
-        if self.parallel_config.pipeline_parallel_size > 1:
-            self.collective_rpc("initialize_pp_transfer_connect")
         self.collective_rpc("load_model")
 
         if self.use_ray_spmd_worker:
             for pp_rank in range(self.parallel_config.pipeline_parallel_size):
                 self.pp_tp_workers.append([])
-
-
-
-
-                #
-
-
+                for tp_rank in range(
+                        int(self.parallel_config.tensor_parallel_size //
+                            num_tpu_per_worker)):
+                    # PP=2, TP=4
+                    # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
+                    rank = (pp_rank * self.parallel_config.tensor_parallel_size
+                            ) + tp_rank
                     assert len(self.pp_tp_workers[pp_rank]) == tp_rank
                     assert pp_rank < len(self.pp_tp_workers)
                     self.pp_tp_workers[pp_rank].append(self.workers[rank])