tpu-inference 0.11.1.dev202511150811__py3-none-any.whl → 0.11.1.dev202512030818__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +303 -34
- tests/lora/test_layers.py +0 -6
- tests/lora/utils.py +0 -8
- tests/test_envs.py +32 -11
- tests/test_utils.py +1 -2
- tpu_inference/__init__.py +22 -3
- tpu_inference/core/disagg_utils.py +6 -8
- tpu_inference/distributed/tpu_connector.py +3 -4
- tpu_inference/distributed/utils.py +3 -2
- tpu_inference/envs.py +61 -8
- tpu_inference/executors/ray_distributed_executor.py +31 -11
- tpu_inference/kernels/fused_moe/v1/kernel.py +641 -110
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +213 -126
- tpu_inference/layers/common/attention_interface.py +7 -1
- tpu_inference/layers/common/sharding.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +74 -25
- tpu_inference/layers/vllm/quantization/common.py +6 -1
- tpu_inference/layers/vllm/quantization/mxfp4.py +137 -62
- tpu_inference/layers/vllm/quantization/unquantized.py +107 -113
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +1 -2
- tpu_inference/models/common/model_loader.py +45 -11
- tpu_inference/models/jax/llama3.py +2 -1
- tpu_inference/models/jax/llama_eagle3.py +8 -5
- tpu_inference/models/jax/llama_guard_4.py +361 -0
- tpu_inference/models/jax/qwen2.py +2 -1
- tpu_inference/models/jax/qwen2_5_vl.py +163 -48
- tpu_inference/models/jax/qwen3.py +2 -1
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +3 -6
- tpu_inference/models/jax/utils/weight_utils.py +198 -143
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -7
- tpu_inference/platforms/tpu_platform.py +28 -22
- tpu_inference/runner/compilation_manager.py +144 -59
- tpu_inference/runner/kv_cache_manager.py +17 -18
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/structured_decoding_manager.py +2 -3
- tpu_inference/runner/tpu_runner.py +271 -147
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +71 -21
- tpu_inference/tpu_info.py +4 -3
- tpu_inference/utils.py +36 -13
- tpu_inference/worker/tpu_worker.py +162 -25
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/METADATA +3 -2
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/RECORD +48 -53
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +0 -28
- tpu_inference/mock/vllm_envs.py +0 -1219
- tpu_inference/mock/vllm_logger.py +0 -212
- tpu_inference/mock/vllm_logging_utils.py +0 -15
- tpu_inference/models/jax/phi3.py +0 -376
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/top_level.txt +0 -0
tpu_inference/distributed/utils.py
CHANGED
@@ -2,6 +2,7 @@ import os
 
 from vllm.utils.network_utils import get_ip
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 logger = init_logger(__name__)
@@ -17,7 +18,7 @@ def set_node_kv_ip_port(ip_port: tuple[int, str, int]):
 
 
 def get_kv_ips() -> str:
-    if
+    if envs.TPU_MULTIHOST_BACKEND == "ray":
         num_nodes = len(_NODES_KV_IP_PORT)
         ips = []
         for node_id in range(num_nodes):
@@ -28,7 +29,7 @@ def get_kv_ips() -> str:
 
 
 def get_kv_ports() -> str:
-    if
+    if envs.TPU_MULTIHOST_BACKEND == "ray":
         num_nodes = len(_NODES_KV_IP_PORT)
         ports = []
         for node_id in range(num_nodes):
tpu_inference/envs.py
CHANGED
@@ -15,18 +15,64 @@ if TYPE_CHECKING:
     PREFILL_SLICES: str = ""
     DECODE_SLICES: str = ""
     SKIP_JAX_PRECOMPILE: bool = False
+    VLLM_XLA_CHECK_RECOMPILATION: bool = False
     MODEL_IMPL_TYPE: str = "flax_nnx"
     NEW_MODEL_DESIGN: bool = False
     PHASED_PROFILING_DIR: str = ""
     PYTHON_TRACER_LEVEL: int = 1
     USE_MOE_EP_KERNEL: bool = False
+    NUM_SLICES: int = 1
     RAY_USAGE_STATS_ENABLED: str = "0"
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "shm"
 
+
+def env_with_choices(
+    env_name: str,
+    default: str | None,
+    choices: list[str] | Callable[[], list[str]],
+    case_sensitive: bool = True,
+) -> Callable[[], str | None]:
+    """
+    Create a lambda that validates environment variable against allowed choices
+
+    Args:
+        env_name: Name of the environment variable
+        default: Default value if not set (can be None)
+        choices: List of valid string options or callable that returns list
+        case_sensitive: Whether validation should be case sensitive
+
+    Returns:
+        Lambda function for environment_variables dict
+    """
+
+    def _get_validated_env() -> str | None:
+        value = os.getenv(env_name)
+        if value is None:
+            return default
+
+        # Resolve choices if it's a callable (for lazy loading)
+        actual_choices = choices() if callable(choices) else choices
+
+        if not case_sensitive:
+            check_value = value.lower()
+            check_choices = [choice.lower() for choice in actual_choices]
+        else:
+            check_value = value
+            check_choices = actual_choices
+
+        if check_value not in check_choices:
+            raise ValueError(f"Invalid value '{value}' for {env_name}. "
+                             f"Valid options: {actual_choices}.")
+
+        return value
+
+    return _get_validated_env
+
+
 environment_variables: dict[str, Callable[[], Any]] = {
     # JAX platform selection (e.g., "tpu", "cpu", "proxy")
     "JAX_PLATFORMS":
-    lambda: os.getenv("JAX_PLATFORMS", ""),
+    lambda: os.getenv("JAX_PLATFORMS", "").lower(),
     # TPU accelerator type (e.g., "v5litepod-16", "v4-8")
     "TPU_ACCELERATOR_TYPE":
     lambda: os.getenv("TPU_ACCELERATOR_TYPE", None),
@@ -38,7 +84,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("TPU_WORKER_ID", None),
     # Backend for multi-host communication on TPU
     "TPU_MULTIHOST_BACKEND":
-
+    env_with_choices("TPU_MULTIHOST_BACKEND", "", ["ray"]),
     # Slice configuration for disaggregated prefill workers
     "PREFILL_SLICES":
     lambda: os.getenv("PREFILL_SLICES", ""),
@@ -47,28 +93,35 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("DECODE_SLICES", ""),
     # Skip JAX precompilation step during initialization
     "SKIP_JAX_PRECOMPILE":
-    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE"
+    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE") or "0")),
+    # Check for XLA recompilation during execution
+    "VLLM_XLA_CHECK_RECOMPILATION":
+    lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION") or "0")),
     # Model implementation type (e.g., "flax_nnx")
     "MODEL_IMPL_TYPE":
-
+    env_with_choices("MODEL_IMPL_TYPE", "flax_nnx",
+                     ["vllm", "flax_nnx", "jetpack"]),
     # Enable new experimental model design
     "NEW_MODEL_DESIGN":
-    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN"
+    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN") or "0")),
     # Directory to store phased profiling output
     "PHASED_PROFILING_DIR":
     lambda: os.getenv("PHASED_PROFILING_DIR", ""),
     # Python tracer level for profiling
     "PYTHON_TRACER_LEVEL":
-    lambda: int(os.getenv("PYTHON_TRACER_LEVEL"
+    lambda: int(os.getenv("PYTHON_TRACER_LEVEL") or "1"),
     # Use custom expert-parallel kernel for MoE (Mixture of Experts)
     "USE_MOE_EP_KERNEL":
-    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL"
+    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL") or "0")),
+    # Number of TPU slices for multi-slice mesh
+    "NUM_SLICES":
+    lambda: int(os.getenv("NUM_SLICES") or "1"),
     # Enable/disable Ray usage statistics collection
     "RAY_USAGE_STATS_ENABLED":
     lambda: os.getenv("RAY_USAGE_STATS_ENABLED", "0"),
     # Ray compiled DAG channel type for TPU
     "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-
+    env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "shm", ["shm"]),
 }
 
 
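The new env_with_choices helper replaces ad-hoc os.getenv lambdas for enumerated settings such as TPU_MULTIHOST_BACKEND and MODEL_IMPL_TYPE: the value is read lazily on each lookup and rejected with a ValueError if it is not one of the allowed choices. A minimal sketch of that behavior, assuming (as in vLLM's envs module, not shown in this diff) that attribute access like envs.TPU_MULTIHOST_BACKEND is routed through a module-level __getattr__ over environment_variables:

# Sketch only: a trimmed copy of env_with_choices plus an assumed module-level
# __getattr__; the real tpu_inference/envs.py is the source of truth.
import os
from typing import Any, Callable


def env_with_choices(env_name: str, default: str | None,
                     choices: list[str],
                     case_sensitive: bool = True) -> Callable[[], str | None]:
    def _get_validated_env() -> str | None:
        value = os.getenv(env_name)
        if value is None:
            return default
        check_value = value if case_sensitive else value.lower()
        check_choices = choices if case_sensitive else [c.lower() for c in choices]
        if check_value not in check_choices:
            raise ValueError(f"Invalid value '{value}' for {env_name}. "
                             f"Valid options: {choices}.")
        return value
    return _get_validated_env


environment_variables: dict[str, Callable[[], Any]] = {
    "TPU_MULTIHOST_BACKEND": env_with_choices("TPU_MULTIHOST_BACKEND", "", ["ray"]),
}


def __getattr__(name: str) -> Any:
    # Assumed resolver: envs.TPU_MULTIHOST_BACKEND re-reads the environment on
    # every access instead of caching a value at import time.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(name)


if __name__ == "__main__":
    os.environ["TPU_MULTIHOST_BACKEND"] = "ray"
    print(environment_variables["TPU_MULTIHOST_BACKEND"]())   # "ray"
    os.environ["TPU_MULTIHOST_BACKEND"] = "gloo"               # not an allowed choice
    try:
        environment_variables["TPU_MULTIHOST_BACKEND"]()
    except ValueError as err:
        print(err)  # Invalid value 'gloo' for TPU_MULTIHOST_BACKEND. Valid options: ['ray'].

The next hunks come from tpu_inference/executors/ray_distributed_executor.py (see the file list above).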
tpu_inference/executors/ray_distributed_executor.py
CHANGED

@@ -108,6 +108,9 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
         ip_port = self.collective_rpc("get_node_kv_ip_port")
         for item in ip_port:
             set_node_kv_ip_port(item)
+        self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and (
+            self.vllm_config.ec_transfer_config is None
+            or not self.vllm_config.ec_transfer_config.is_ec_producer)
 
     def _initialize_ray_cluster(self) -> None:
         """Initialize the distributed cluster with Ray.
@@ -131,10 +134,21 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
                 f"current platform {current_platform.device_name} does not "
                 "support ray.")
 
-
-
-
-
+        pp_size = self.parallel_config.pipeline_parallel_size
+        placement_group_specs: List[Dict[str, float]] = []
+
+        ray_nodes = ray.nodes()
+        logger.info(f"RayDistributedExecutor | ray_nodes={ray_nodes}")
+
+        if pp_size == 1:
+            placement_group_specs = [{
+                device_str: node['Resources'][device_str]
+            } for node in ray_nodes]
+        else:
+            num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
+            placement_group_specs = [{
+                device_str: num_devices_per_pp_rank
+            } for _ in range(pp_size)]
 
         # vLLM engine is also a worker to execute model with an accelerator,
         # so it requires to have the device in a current node. Check if
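The _initialize_ray_cluster change above sizes Ray placement-group bundles differently depending on pipeline parallelism: with PP=1 it requests every TPU advertised by each Ray node, with PP>1 it requests one bundle per PP rank sized to sharding_config.total_devices. A small sketch of that bundle construction, with device_str, the node resources, and the per-rank device count as stand-in values:

# Hedged sketch of the placement-group bundles the executor now requests.
# device_str, the node resources, and num_devices_per_pp_rank are stand-ins
# for values that come from vLLM/Ray at runtime.
from typing import Dict, List

device_str = "TPU"


def build_placement_group_specs(pp_size: int,
                                ray_nodes: List[Dict],
                                num_devices_per_pp_rank: int) -> List[Dict[str, float]]:
    if pp_size == 1:
        # One bundle per Ray node, claiming every TPU that node advertises.
        return [{device_str: node["Resources"][device_str]} for node in ray_nodes]
    # With pipeline parallelism, one bundle per PP rank, each sized to the
    # devices a single PP stage needs.
    return [{device_str: float(num_devices_per_pp_rank)} for _ in range(pp_size)]


# Example: two nodes with 4 TPUs each.
nodes = [{"Resources": {"TPU": 4.0}}, {"Resources": {"TPU": 4.0}}]
print(build_placement_group_specs(1, nodes, 4))  # [{'TPU': 4.0}, {'TPU': 4.0}]
print(build_placement_group_specs(2, nodes, 4))  # [{'TPU': 4.0}, {'TPU': 4.0}]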
@@ -329,6 +343,8 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
         all_kwargs = []
         for rank, (node_id, _) in enumerate(worker_node_and_tpu_ids):
             local_rank = node_workers[node_id].index(rank)
+            ip = sorted_worker_metadata[rank].ip
+            prev_ip = sorted_worker_metadata[rank - 1].ip if rank > 0 else ""
             kwargs = dict(
                 vllm_config=self.vllm_config,
                 local_rank=local_rank,
@@ -336,22 +352,26 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
                 distributed_init_method=distributed_init_method,
                 is_driver_worker=(not self.parallel_config)
                 or (rank % self.parallel_config.tensor_parallel_size == 0),
+                ip=ip,
+                prev_worker_ip=prev_ip,
             )
             all_kwargs.append(kwargs)
         self.collective_rpc("init_worker", args=(all_kwargs, ))
         self.collective_rpc("init_device")
+        if self.parallel_config.pipeline_parallel_size > 1:
+            self.collective_rpc("initialize_pp_transfer_connect")
         self.collective_rpc("load_model")
 
         if self.use_ray_spmd_worker:
             for pp_rank in range(self.parallel_config.pipeline_parallel_size):
                 self.pp_tp_workers.append([])
-
-
-
-
-                #
-
-
+                num_tp_workers = int(
+                    self.parallel_config.tensor_parallel_size //
+                    num_tpu_per_worker)
+                for tp_rank in range(num_tp_workers):
+                    # PP=2, TP=4, num_tpu_per_worker=2
+                    # pp_tp_workers = [[0, 1], [2, 3]]
+                    rank = (pp_rank * num_tp_workers) + tp_rank
                     assert len(self.pp_tp_workers[pp_rank]) == tp_rank
                     assert pp_rank < len(self.pp_tp_workers)
                     self.pp_tp_workers[pp_rank].append(self.workers[rank])