tpu-inference 0.11.1.dev202511130813__py3-none-any.whl → 0.11.1.dev202511180814__py3-none-any.whl

This diff compares the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of tpu-inference has been flagged as potentially problematic.

Files changed (37)
  1. tests/test_envs.py +182 -0
  2. tests/test_utils.py +23 -14
  3. tpu_inference/core/core_tpu.py +17 -9
  4. tpu_inference/executors/ray_distributed_executor.py +24 -11
  5. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +33 -10
  6. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +7 -0
  7. tpu_inference/layers/{jax → common}/attention_interface.py +1 -1
  8. tpu_inference/layers/common/quant_methods.py +8 -0
  9. tpu_inference/layers/jax/attention/attention.py +1 -1
  10. tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
  11. tpu_inference/layers/jax/sample/sampling.py +2 -2
  12. tpu_inference/layers/vllm/attention.py +1 -1
  13. tpu_inference/layers/vllm/quantization/__init__.py +7 -3
  14. tpu_inference/layers/vllm/quantization/awq.py +4 -3
  15. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -2
  16. tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
  17. tpu_inference/layers/vllm/quantization/unquantized.py +4 -3
  18. tpu_inference/models/common/model_loader.py +3 -2
  19. tpu_inference/models/jax/llama3.py +2 -2
  20. tpu_inference/models/jax/phi3.py +1 -1
  21. tpu_inference/models/jax/qwen2.py +1 -1
  22. tpu_inference/models/jax/qwen2_5_vl.py +2 -2
  23. tpu_inference/models/jax/qwen3.py +1 -1
  24. tpu_inference/models/vllm/vllm_model_wrapper.py +22 -10
  25. tpu_inference/platforms/tpu_platform.py +12 -5
  26. tpu_inference/runner/compilation_manager.py +4 -2
  27. tpu_inference/runner/kv_cache.py +1 -1
  28. tpu_inference/runner/tpu_runner.py +31 -7
  29. tpu_inference/utils.py +2 -2
  30. tpu_inference/worker/tpu_worker.py +1 -1
  31. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/METADATA +1 -1
  32. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/RECORD +37 -34
  33. /tpu_inference/layers/{jax → common}/binary_search.py +0 -0
  34. /tpu_inference/layers/{jax → common}/sharding.py +0 -0
  35. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/WHEEL +0 -0
  36. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/licenses/LICENSE +0 -0
  37. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/top_level.txt +0 -0
tests/test_envs.py ADDED
@@ -0,0 +1,182 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the tpu-inference project
+
+ import pytest
+
+ import tpu_inference.envs as envs
+ from tpu_inference.envs import enable_envs_cache, environment_variables
+
+
+ def test_getattr_without_cache(monkeypatch: pytest.MonkeyPatch):
+     assert envs.JAX_PLATFORMS == ""
+     assert envs.PHASED_PROFILING_DIR == ""
+     monkeypatch.setenv("JAX_PLATFORMS", "tpu")
+     monkeypatch.setenv("PHASED_PROFILING_DIR", "/tmp/profiling")
+     assert envs.JAX_PLATFORMS == "tpu"
+     assert envs.PHASED_PROFILING_DIR == "/tmp/profiling"
+
+     assert envs.TPU_NAME is None
+     assert envs.TPU_ACCELERATOR_TYPE is None
+     monkeypatch.setenv("TPU_NAME", "my-tpu")
+     monkeypatch.setenv("TPU_ACCELERATOR_TYPE", "v5litepod-16")
+     assert envs.TPU_NAME == "my-tpu"
+     assert envs.TPU_ACCELERATOR_TYPE == "v5litepod-16"
+
+     # __getattr__ is not decorated with functools.cache
+     assert not hasattr(envs.__getattr__, "cache_info")
+
+
+ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.setenv("JAX_PLATFORMS", "tpu")
+     monkeypatch.setenv("TPU_NAME", "my-tpu")
+
+     # __getattr__ is not decorated with functools.cache
+     assert not hasattr(envs.__getattr__, "cache_info")
+
+     enable_envs_cache()
+
+     # __getattr__ is decorated with functools.cache
+     assert hasattr(envs.__getattr__, "cache_info")
+     start_hits = envs.__getattr__.cache_info().hits
+
+     # 2 more hits due to JAX_PLATFORMS and TPU_NAME accesses
+     assert envs.JAX_PLATFORMS == "tpu"
+     assert envs.TPU_NAME == "my-tpu"
+     assert envs.__getattr__.cache_info().hits == start_hits + 2
+
+     # All environment variables are cached
+     for environment_variable in environment_variables:
+         envs.__getattr__(environment_variable)
+     assert envs.__getattr__.cache_info(
+     ).hits == start_hits + 2 + len(environment_variables)
+
+     # Reset envs.__getattr__ back to non-cached version to
+     # avoid affecting other tests
+     envs.__getattr__ = envs.__getattr__.__wrapped__
+
+
+ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
+     # Test SKIP_JAX_PRECOMPILE (default False)
+     assert envs.SKIP_JAX_PRECOMPILE is False
+     monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "1")
+     assert envs.SKIP_JAX_PRECOMPILE is True
+     monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "0")
+     assert envs.SKIP_JAX_PRECOMPILE is False
+
+     # Test NEW_MODEL_DESIGN (default False)
+     assert envs.NEW_MODEL_DESIGN is False
+     monkeypatch.setenv("NEW_MODEL_DESIGN", "1")
+     assert envs.NEW_MODEL_DESIGN is True
+
+     # Test USE_MOE_EP_KERNEL (default False)
+     assert envs.USE_MOE_EP_KERNEL is False
+     monkeypatch.setenv("USE_MOE_EP_KERNEL", "1")
+     assert envs.USE_MOE_EP_KERNEL is True
+
+
+ def test_integer_env_vars(monkeypatch: pytest.MonkeyPatch):
+     assert envs.PYTHON_TRACER_LEVEL == 1
+     monkeypatch.setenv("PYTHON_TRACER_LEVEL", "3")
+     assert envs.PYTHON_TRACER_LEVEL == 3
+     monkeypatch.setenv("PYTHON_TRACER_LEVEL", "0")
+     assert envs.PYTHON_TRACER_LEVEL == 0
+
+
+ def test_lowercase_conversion(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "GRPC")
+     assert envs.TPU_MULTIHOST_BACKEND == "grpc"
+
+     monkeypatch.setenv("MODEL_IMPL_TYPE", "FLAX_NNX")
+     assert envs.MODEL_IMPL_TYPE == "flax_nnx"
+
+
+ def test_string_env_vars_defaults(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.delenv("JAX_PLATFORMS", raising=False)
+     monkeypatch.delenv("PREFILL_SLICES", raising=False)
+     monkeypatch.delenv("DECODE_SLICES", raising=False)
+
+     assert envs.JAX_PLATFORMS == ""
+     assert envs.PREFILL_SLICES == ""
+     assert envs.DECODE_SLICES == ""
+     assert envs.PHASED_PROFILING_DIR == ""
+
+
+ def test_none_default_env_vars(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.delenv("TPU_ACCELERATOR_TYPE", raising=False)
+     monkeypatch.delenv("TPU_NAME", raising=False)
+     monkeypatch.delenv("TPU_WORKER_ID", raising=False)
+
+     assert envs.TPU_ACCELERATOR_TYPE is None
+     assert envs.TPU_NAME is None
+     assert envs.TPU_WORKER_ID is None
+
+
+ def test_ray_env_vars(monkeypatch: pytest.MonkeyPatch):
+     assert envs.RAY_USAGE_STATS_ENABLED == "0"
+     monkeypatch.setenv("RAY_USAGE_STATS_ENABLED", "1")
+     assert envs.RAY_USAGE_STATS_ENABLED == "1"
+
+     assert envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "shm"
+     monkeypatch.setenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "nccl")
+     assert envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "nccl"
+
+
+ def test_invalid_attribute_raises_error():
+     with pytest.raises(AttributeError,
+                        match="has no attribute 'NONEXISTENT_VAR'"):
+         _ = envs.NONEXISTENT_VAR
+
+
+ def test_dir_returns_all_env_vars():
+     env_vars = envs.__dir__()
+     assert isinstance(env_vars, list)
+     assert len(env_vars) == len(environment_variables)
+     assert "JAX_PLATFORMS" in env_vars
+     assert "TPU_NAME" in env_vars
+     assert "SKIP_JAX_PRECOMPILE" in env_vars
+     assert "MODEL_IMPL_TYPE" in env_vars
+
+
+ def test_tpu_multihost_env_vars(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.setenv("TPU_WORKER_ID", "0")
+     assert envs.TPU_WORKER_ID == "0"
+
+     monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "grpc")
+     assert envs.TPU_MULTIHOST_BACKEND == "grpc"
+
+     monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "xla")
+     assert envs.TPU_MULTIHOST_BACKEND == "xla"
+
+
+ def test_disaggregated_serving_env_vars(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.setenv("PREFILL_SLICES", "0,1,2,3")
+     assert envs.PREFILL_SLICES == "0,1,2,3"
+
+     monkeypatch.setenv("DECODE_SLICES", "4,5,6,7")
+     assert envs.DECODE_SLICES == "4,5,6,7"
+
+
+ def test_model_impl_type_default(monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.delenv("MODEL_IMPL_TYPE", raising=False)
+     assert envs.MODEL_IMPL_TYPE == "flax_nnx"
+
+
+ def test_cache_preserves_values_across_env_changes(
+         monkeypatch: pytest.MonkeyPatch):
+     monkeypatch.setenv("JAX_PLATFORMS", "tpu")
+
+     enable_envs_cache()
+
+     assert envs.JAX_PLATFORMS == "tpu"
+
+     # Change environment variable
+     monkeypatch.setenv("JAX_PLATFORMS", "cpu")
+
+     # Cached value should still be "tpu"
+     assert envs.JAX_PLATFORMS == "tpu"
+
+     # Reset envs.__getattr__ back to non-cached version
+     envs.__getattr__ = envs.__getattr__.__wrapped__
+
+     # Now it should reflect the new value
+     assert envs.JAX_PLATFORMS == "cpu"
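Editor's note: the tests above pin down the contract of tpu_inference.envs: typed per-variable defaults, lowercase normalization for some settings, a module-level `__getattr__` accessor, and an `enable_envs_cache()` switch that wraps that accessor with `functools.cache` (hence the `cache_info` / `__wrapped__` checks). A minimal sketch of that pattern follows; it is an illustration of the mechanism, not the package's actual envs.py, and the defaults and parsing shown are assumptions.

```python
# Editor's sketch (not the package's actual envs.py): the module-level
# __getattr__ pattern that tests/test_envs.py exercises.
import functools
import os

environment_variables = {
    # string with empty-string default
    "JAX_PLATFORMS": lambda: os.getenv("JAX_PLATFORMS", ""),
    # string with None default
    "TPU_NAME": lambda: os.getenv("TPU_NAME"),
    # boolean flag parsed from "0"/"1"
    "SKIP_JAX_PRECOMPILE": lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE", "0"))),
    # integer setting
    "PYTHON_TRACER_LEVEL": lambda: int(os.getenv("PYTHON_TRACER_LEVEL", "1")),
    # lowercased string setting with a default
    "MODEL_IMPL_TYPE": lambda: os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower(),
}


def __getattr__(name: str):
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    return list(environment_variables.keys())


def enable_envs_cache():
    # Wrap the accessor with functools.cache so each variable is read once;
    # the tests undo this via envs.__getattr__.__wrapped__.
    global __getattr__
    __getattr__ = functools.cache(__getattr__)
```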
tests/test_utils.py CHANGED
@@ -75,25 +75,34 @@ def test_hbm_usage_bytes_pathways_enabled(mock_devices, mock_live_arrays):
  mock_device2 = MagicMock()
  devices = [mock_device1, mock_device2]

- # Create mock device buffers
- mock_buffer1_dev1 = MagicMock()
- mock_buffer1_dev1.device = mock_device1
- mock_buffer1_dev1.nbytes = 2000 # 2000 bytes on device1
+ # Create mock addressable shards with data property
+ mock_data1_dev1 = MagicMock()
+ mock_data1_dev1.device = mock_device1
+ mock_data1_dev1.nbytes = 2000 # 2000 bytes on device1

- mock_buffer1_dev2 = MagicMock()
- mock_buffer1_dev2.device = mock_device2
- mock_buffer1_dev2.nbytes = 2000 # 2000 bytes on device2
+ mock_data1_dev2 = MagicMock()
+ mock_data1_dev2.device = mock_device2
+ mock_data1_dev2.nbytes = 2000 # 2000 bytes on device2

- mock_buffer2_dev1 = MagicMock()
- mock_buffer2_dev1.device = mock_device1
- mock_buffer2_dev1.nbytes = 1000 # 1000 bytes on device1
+ mock_data2_dev1 = MagicMock()
+ mock_data2_dev1.device = mock_device1
+ mock_data2_dev1.nbytes = 1000 # 1000 bytes on device1

- # Create mock arrays with device buffers
+ mock_shard1_dev1 = MagicMock()
+ mock_shard1_dev1.data = mock_data1_dev1
+
+ mock_shard1_dev2 = MagicMock()
+ mock_shard1_dev2.data = mock_data1_dev2
+
+ mock_shard2_dev1 = MagicMock()
+ mock_shard2_dev1.data = mock_data2_dev1
+
+ # Create mock arrays with addressable_shards
  mock_array1 = MagicMock()
- mock_array1.device_buffers = [mock_buffer1_dev1, mock_buffer1_dev2]
+ mock_array1.addressable_shards = [mock_shard1_dev1, mock_shard1_dev2]

  mock_array2 = MagicMock()
- mock_array2.device_buffers = [mock_buffer2_dev1]
+ mock_array2.addressable_shards = [mock_shard2_dev1]

  mock_live_arrays.return_value = [mock_array1, mock_array2]

@@ -159,7 +168,7 @@ def test_hbm_usage_bytes_pathways_no_arrays(mock_devices, mock_live_arrays):
  "head_dim, expected_padded_head_dim",
  [
  (1, 128),
- (64, 128),
+ (64, 64),
  (127, 128),
  (128, 128),
  (129, 256),
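Editor's note: the updated mocks reflect a move in the HBM-usage helper from the removed `Array.device_buffers` attribute to `Array.addressable_shards`, whose `.data` field carries the per-device buffer. A rough sketch of the accounting these mocks imply; the function name and return shape are assumptions, not the package's code:

```python
# Editor's sketch: per-device HBM usage summed over live JAX arrays via
# addressable_shards, as the updated mocks in test_utils.py suggest.
from collections import defaultdict

import jax


def hbm_usage_bytes_per_device():
    usage = defaultdict(int)
    for array in jax.live_arrays():
        for shard in array.addressable_shards:
            # shard.data is the single-device array backing this shard
            usage[shard.data.device] += shard.data.nbytes
    return dict(usage)
```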
tpu_inference/core/core_tpu.py CHANGED
@@ -29,6 +29,7 @@ from vllm.v1.request import Request, RequestStatus

  from tpu_inference import utils as common_utils
  from tpu_inference.core import disagg_executor, disagg_utils
+ from tpu_inference.runner.tpu_runner import AsyncTPUModelRunnerOutput
  # ======================================================================================
  # Imports for _DisaggOrchestrator (decoupled from vLLM)
  # ======================================================================================
@@ -186,6 +187,8 @@ class _DisaggOrchestrator:
  if model_output is None:
  model_output = prefill_engine.model_executor.sample_tokens(
  grammar_output)
+ if isinstance(model_output, AsyncTPUModelRunnerOutput):
+ model_output = model_output.get_output()

  if scheduler_output.total_num_scheduled_tokens > 0:
  logger.debug(f"Prefill result: {model_output}")
@@ -218,15 +221,16 @@ class _DisaggOrchestrator:
  f"request-{req_id}: tokens={request.all_token_ids} after prefill"
  )
  # Remove request from the prefill engine.
+ if req_id in prefill_engine.scheduler.requests:
+ request = prefill_engine.scheduler.requests[req_id]
+ prefill_engine.scheduler.running.remove(request)
+ prefill_engine.scheduler.encoder_cache_manager.free(
+ request)

- request = prefill_engine.scheduler.requests[req_id]
- prefill_engine.scheduler.running.remove(request)
- prefill_engine.scheduler.encoder_cache_manager.free(
- request)
+ prefill_engine.scheduler.kv_cache_manager.free(
+ request)

- prefill_engine.scheduler.kv_cache_manager.free(request)
-
- prefill_engine.scheduler.requests.pop(req_id)
+ prefill_engine.scheduler.requests.pop(req_id)

  for output in (engine_core_outputs.items()
  if engine_core_outputs else ()):
@@ -335,8 +339,10 @@ class _DisaggOrchestrator:
  new_block_ids = kv_cache_manager.get_block_ids(req_id)
  logger.debug(
  f"inserting {req_id} new_block_ids {new_block_ids}")
- assert (len(new_block_ids[0]) == math.ceil(
- prompt_tokens / self._config.cache_config.block_size))
+ if len(new_block_ids[0]) != math.ceil(
+ prompt_tokens / self._config.cache_config.block_size):
+ logger.warning("Running out of blocks in decode engine! ")
+ break

  decode_engine.model_executor.driver_worker.model_runner.insert_request_with_kv_cache(
  vllm_request, kv_cache, new_block_ids)
@@ -366,6 +372,8 @@ class _DisaggOrchestrator:
  if model_output is None:
  model_output = decode_engine.model_executor.sample_tokens(
  grammar_output)
+ if isinstance(model_output, AsyncTPUModelRunnerOutput):
+ model_output = model_output.get_output()

  if scheduler_output.total_num_scheduled_tokens > 0:
  logger.debug(f"Decode result: {model_output}")
tpu_inference/executors/ray_distributed_executor.py CHANGED
@@ -131,10 +131,17 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
  f"current platform {current_platform.device_name} does not "
  "support ray.")

- placement_group_specs: List[Dict[str, float]] = [{
- device_str:
- node['Resources'][device_str]
- } for node in ray.nodes()]
+ pp_size = self.parallel_config.pipeline_parallel_size
+ placement_group_specs: List[Dict[str, float]] = []
+ if pp_size == 1:
+ placement_group_specs = [{
+ device_str: node['Resources'][device_str]
+ } for node in ray.nodes()]
+ else:
+ num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
+ placement_group_specs = [{
+ device_str: num_devices_per_pp_rank
+ } for _ in range(pp_size)]

  # vLLM engine is also a worker to execute model with an accelerator,
  # so it requires to have the device in a current node. Check if
@@ -329,6 +336,8 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
  all_kwargs = []
  for rank, (node_id, _) in enumerate(worker_node_and_tpu_ids):
  local_rank = node_workers[node_id].index(rank)
+ ip = sorted_worker_metadata[rank].ip
+ prev_ip = sorted_worker_metadata[rank - 1].ip if rank > 0 else ""
  kwargs = dict(
  vllm_config=self.vllm_config,
  local_rank=local_rank,
@@ -336,22 +345,26 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
  distributed_init_method=distributed_init_method,
  is_driver_worker=(not self.parallel_config)
  or (rank % self.parallel_config.tensor_parallel_size == 0),
+ ip=ip,
+ prev_worker_ip=prev_ip,
  )
  all_kwargs.append(kwargs)
  self.collective_rpc("init_worker", args=(all_kwargs, ))
  self.collective_rpc("init_device")
+ if self.parallel_config.pipeline_parallel_size > 1:
+ self._run_workers("initialize_pp_transfer_connect")
  self.collective_rpc("load_model")

  if self.use_ray_spmd_worker:
  for pp_rank in range(self.parallel_config.pipeline_parallel_size):
  self.pp_tp_workers.append([])
- for tp_rank in range(
- int(self.parallel_config.tensor_parallel_size //
- num_tpu_per_worker)):
- # PP=2, TP=4
- # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]]
- rank = (pp_rank * self.parallel_config.tensor_parallel_size
- ) + tp_rank
+ num_tp_workers = int(
+ self.parallel_config.tensor_parallel_size //
+ num_tpu_per_worker)
+ for tp_rank in range(num_tp_workers):
+ # PP=2, TP=4, num_tpu_per_worker=2
+ # pp_tp_workers = [[0, 1], [2, 3]]
+ rank = (pp_rank * num_tp_workers) + tp_rank
  assert len(self.pp_tp_workers[pp_rank]) == tp_rank
  assert pp_rank < len(self.pp_tp_workers)
  self.pp_tp_workers[pp_rank].append(self.workers[rank])
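Editor's note: two related fixes here. Placement groups are now sized per pipeline stage when `pipeline_parallel_size > 1`, and the `pp_tp_workers` indexing counts Ray workers (each of which may hold several TPU chips) rather than raw TP ranks. Using the figures from the diff's own comment, the new grouping works out as follows:

```python
# Editor's illustration of the corrected pp_tp_workers grouping.
# With PP=2, TP=4 and 2 TPU chips per Ray worker there are 4 workers total,
# grouped as [[0, 1], [2, 3]] instead of the old [[0, 1, 2, 3], [4, 5, 6, 7]].
pipeline_parallel_size = 2
tensor_parallel_size = 4
num_tpu_per_worker = 2

num_tp_workers = tensor_parallel_size // num_tpu_per_worker
pp_tp_workers = [[pp_rank * num_tp_workers + tp_rank
                  for tp_rank in range(num_tp_workers)]
                 for pp_rank in range(pipeline_parallel_size)]
assert pp_tp_workers == [[0, 1], [2, 3]]
```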
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py CHANGED
@@ -317,6 +317,20 @@ def _ragged_paged_attention_kernel(
  q_len = q_end - q_start
  kv_len = kv_lens_ref[seq_idx]

+ bkv_idx_start = 0 if sliding_window is None else jnp.maximum(
+ kv_len - sliding_window, 0) // bkv_sz
+
+ if sliding_window is None:
+ next_bkv_idx_start = 0
+ else:
+
+ def get_next_bkv_idx_start():
+ next_kv_len = kv_lens_ref[seq_idx + 1]
+ return jnp.maximum(next_kv_len - sliding_window, 0) // bkv_sz
+
+ next_bkv_idx_start = lax.cond(seq_idx + 1 < num_seqs,
+ get_next_bkv_idx_start, lambda: 0)
+
  def debug_print(msg, *args):
  if debug_mode:
  pl.debug_print(msg, *args)
@@ -353,8 +367,8 @@ def _ragged_paged_attention_kernel(
  head_acc_ref = acc_ref.at[kv_head_idx, :q.shape[0]]

  def load_with_init(ref, init_val):
- return jnp.where(bkv_idx == 0, jnp.full_like(ref, init_val),
- ref[...])
+ return jnp.where(bkv_idx == bkv_idx_start,
+ jnp.full_like(ref, init_val), ref[...])

  # Follow FlashAttention-2 forward pass.
  if q_scale is not None:
@@ -378,9 +392,6 @@ def _ragged_paged_attention_kernel(
  num_q_heads_per_kv_head)
  k_span = bkv_idx * bkv_sz + lax.broadcasted_iota(jnp.int32, s.shape, 1)
  mask = q_span < k_span
- # TODO(jevinjiang, xiowei): reduce pages_per_seq based on sliding_window.
- if sliding_window is not None:
- mask = jnp.logical_or(mask, q_span - sliding_window >= k_span)

  if soft_cap is not None:
  s = soft_cap * jnp.tanh(s / soft_cap)
@@ -391,7 +402,8 @@ def _ragged_paged_attention_kernel(
  sinks = attention_sink_ref[kv_head_idx]
  actual_bq_sz = q.shape[0] // num_q_heads_per_kv_head
  m_prev_init = jnp.concat([sinks] * actual_bq_sz, axis=0)
- m_prev = jnp.where(bkv_idx == 0, m_prev_init, head_m_ref[...])
+ m_prev = jnp.where(bkv_idx == bkv_idx_start, m_prev_init,
+ head_m_ref[...])
  else:
  m_prev = load_with_init(head_m_ref, -jnp.inf)

@@ -719,12 +731,19 @@ def _ragged_paged_attention_kernel(
  def get_next_bkv_ids(seq_idx, bq_idx, bkv_idx, bkv_sem_idx):
  next_bkv_idx = bkv_idx + 1
  is_last_bkv = next_bkv_idx == num_bkv
- next_bkv_idx = lax.select(is_last_bkv, 0, next_bkv_idx)
  next_bq_idx = lax.select(is_last_bkv, bq_idx + 1, bq_idx)
  is_last_bq = next_bq_idx == num_bq
  next_bq_idx = lax.select(is_last_bq, 0, next_bq_idx)
  next_seq_idx = lax.select(is_last_bq, seq_idx + 1, seq_idx)
  next_bkv_sem_idx = lax.select(bkv_sem_idx == 0, 1, 0)
+
+ next_bkv_idx = lax.select(
+ is_last_bkv,
+ lax.select(
+ is_last_bq,
+ next_bkv_idx_start,
+ bkv_idx_start,
+ ), next_bkv_idx)
  return next_seq_idx, next_bq_idx, next_bkv_idx, next_bkv_sem_idx

  def compute_with_bq(bq_idx, _):
@@ -759,7 +778,7 @@ def _ragged_paged_attention_kernel(
  next_bkv_sem_idx)

  # Wait for cur bq if not ready yet
- @pl.when(bkv_idx == 0)
+ @pl.when(bkv_idx == bkv_idx_start)
  def wait_cur_bq():
  wait_fetch_bq(seq_idx, bq_idx, bq_sem_idx)

@@ -808,7 +827,11 @@ def _ragged_paged_attention_kernel(
  kv_head_idx=kv_head_idx,
  )

- lax.fori_loop(0, num_bkv, compute_with_bkv, None, unroll=False)
+ lax.fori_loop(bkv_idx_start,
+ num_bkv,
+ compute_with_bkv,
+ None,
+ unroll=False)

  # Load acc and calculate final output.
  acc = acc_ref[...]
@@ -838,7 +861,7 @@ def _ragged_paged_attention_kernel(
  @pl.when(seq_idx == 0)
  def prologue():
  start_fetch_bq(0, 0, 0)
- start_fetch_bkv(0, 0, 0)
+ start_fetch_bkv(0, bkv_idx_start, 0)

  @pl.when(seq_idx < decode_end)
  def process_decode():
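Editor's note: the sliding-window handling moves from per-element masking to skipping whole KV blocks: the KV-block loop, the prefetch, and the accumulator initialization are all re-based on `bkv_idx_start = max(kv_len - sliding_window, 0) // bkv_sz`. A worked example with illustrative sizes:

```python
# Editor's illustration of bkv_idx_start for sliding-window attention.
# With kv_len=4096, sliding_window=1024 and KV block size bkv_sz=256, the
# kernel now begins the KV-block loop for this sequence at block
# (4096 - 1024) // 256 = 12 rather than at block 0.
kv_len = 4096          # hypothetical sequence KV length
sliding_window = 1024  # hypothetical window size
bkv_sz = 256           # hypothetical KV block size

bkv_idx_start = max(kv_len - sliding_window, 0) // bkv_sz
assert bkv_idx_start == 12
```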
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py CHANGED
@@ -1231,6 +1231,13 @@ TUNED_BLOCK_SIZES = {
  },
  }
  },
+ 16: {
+ 'q_bfloat16_kv_bfloat16': {
+ 'q_head-8_kv_head-1_head-128': {
+ 262144: (128, 256),
+ }
+ }
+ },
  },
  'TPU v5e': {
  128: {
tpu_inference/layers/{jax → common}/attention_interface.py RENAMED
@@ -17,7 +17,7 @@ import tpu_inference.kernels.ragged_paged_attention.v3.kernel as rpa
  import tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 as rpa_hd64
  from tpu_inference.kernels.flash_attention.kernel import flash_attention
  from tpu_inference.layers.common.attention_metadata import AttentionMetadata
- from tpu_inference.layers.jax.sharding import ShardingAxisName
+ from tpu_inference.layers.common.sharding import ShardingAxisName
  from tpu_inference.utils import get_megacore

  MAX_ALLOWED_PAGE_INDICES_N = (
tpu_inference/layers/common/quant_methods.py ADDED
@@ -0,0 +1,8 @@
+ UNQUANTIZED = "unquantized"
+ MXFP4 = "mxfp4"
+ AWQ = "awq"
+ COMPRESSED_TENSORS = "compressed-tensors"
+
+
+ def get_tpu_quant_method(quant_method: str) -> str:
+     return "tpu-" + quant_method
tpu_inference/layers/jax/attention/attention.py CHANGED
@@ -13,9 +13,9 @@ from tpu_inference import utils
  from tpu_inference.kernels.ragged_paged_attention.v3.kernel import \
  ragged_paged_attention
  from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+ from tpu_inference.layers.common.sharding import ShardingAxisName
  from tpu_inference.layers.jax.base import create_param
  from tpu_inference.layers.jax.rope_interface import apply_rope
- from tpu_inference.layers.jax.sharding import ShardingAxisName

  KVCache = Tuple[jax.Array, jax.Array]

tpu_inference/layers/jax/sample/rejection_sampler.py CHANGED
@@ -12,7 +12,7 @@ import jax
  import jax.numpy as jnp
  import numpy as np

- from tpu_inference.layers.jax.binary_search import topk_mask, topp_mask
+ from tpu_inference.layers.common.binary_search import topk_mask, topp_mask
  from tpu_inference.layers.jax.sample.sampling_metadata import \
  TPUSupportedSamplingMetadata

tpu_inference/layers/jax/sample/sampling.py CHANGED
@@ -6,10 +6,10 @@ from jax.sharding import Mesh, NamedSharding
  from jax.sharding import PartitionSpec as P
  from vllm.v1.outputs import LogprobsTensors

- from tpu_inference.layers.jax.binary_search import topk_mask, topp_mask
+ from tpu_inference.layers.common.binary_search import topk_mask, topp_mask
+ from tpu_inference.layers.common.sharding import ShardingAxisName
  from tpu_inference.layers.jax.sample.sampling_metadata import \
  TPUSupportedSamplingMetadata
- from tpu_inference.layers.jax.sharding import ShardingAxisName

  _SAMPLING_EPS = 1e-5

tpu_inference/layers/vllm/attention.py CHANGED
@@ -13,8 +13,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
  AttentionLayer, AttentionType)

  from tpu_inference import utils
+ from tpu_inference.layers.common.attention_interface import attention
  from tpu_inference.layers.common.attention_metadata import AttentionMetadata
- from tpu_inference.layers.jax.attention_interface import attention
  from tpu_inference.logger import init_logger
  from tpu_inference.models.vllm.vllm_model_wrapper_context import \
  get_vllm_model_wrapper_context
tpu_inference/layers/vllm/quantization/__init__.py CHANGED
@@ -5,10 +5,12 @@ from vllm.config import VllmConfig
  from vllm.model_executor.layers.quantization.base_config import \
  QuantizationConfig

+ from tpu_inference.layers.common import quant_methods
  from tpu_inference.layers.vllm.quantization.awq import VllmAWQConfig
  from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
  from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import \
  VllmCompressedTensorsConfig # noqa: E501
+ from tpu_inference.layers.vllm.quantization.mxfp4 import VllmMxfp4Config
  from tpu_inference.layers.vllm.quantization.unquantized import \
  VllmUnquantizedConfig

@@ -19,8 +21,9 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
  # TODO(kyuyeunk): Add support for "tpu_int8".
  method_to_config: dict[str, str] = {
  None: VllmUnquantizedConfig,
- "compressed-tensors": VllmCompressedTensorsConfig,
- "awq": VllmAWQConfig,
+ quant_methods.COMPRESSED_TENSORS: VllmCompressedTensorsConfig,
+ quant_methods.AWQ: VllmAWQConfig,
+ quant_methods.MXFP4: VllmMxfp4Config,
  }
  if model_config.quantization not in method_to_config:
  raise NotImplementedError(
@@ -30,6 +33,7 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
  assert issubclass(quant_config, JaxCommonConfig)
  quant_config.set_configs(vllm_config, mesh)

- model_config.quantization = quant_config.get_name()
+ model_config.quantization = quant_methods.get_tpu_quant_method(
+ quant_config.get_name())
  return VllmConfig.get_quantization_config(model_config,
  vllm_config.load_config)
tpu_inference/layers/vllm/quantization/awq.py CHANGED
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
  is_layer_skipped, unpack_quantized_values_into_int32)
  from vllm.scalar_type import scalar_types

+ from tpu_inference.layers.common.quant_methods import AWQ, get_tpu_quant_method
  from tpu_inference.layers.vllm.linear_common import (
  slice_sharded_tensor_for_concatenation, torch_to_jax_param)
  from tpu_inference.layers.vllm.quantization.common import (
@@ -29,12 +30,12 @@ P = PartitionSpec
  logger = init_logger(__name__)


- @register_quantization_config("jax-awq")
+ @register_quantization_config(get_tpu_quant_method(AWQ))
  class VllmAWQConfig(AWQConfig, JaxCommonConfig):

  @classmethod
- def get_name(cls) -> str:
- return "jax-awq"
+ def get_name(cls):
+ return AWQ

  def get_supported_act_dtypes(self) -> list[torch.dtype]:
  # NOTE: AWQ checkpoint was quantized with float16. But on TPUs, using
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py CHANGED
@@ -16,6 +16,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
  from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
  find_matched_target, should_ignore_layer)

+ from tpu_inference.layers.common.quant_methods import (COMPRESSED_TENSORS,
+ get_tpu_quant_method)
  from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
  from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import \
  VllmCompressedTensorsW8A8Fp8MoEMethod
@@ -30,12 +32,12 @@ P = PartitionSpec
  logger = init_logger(__name__)


- @register_quantization_config("jax-compressed-tensors")
+ @register_quantization_config(get_tpu_quant_method(COMPRESSED_TENSORS))
  class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):

  @classmethod
  def get_name(cls) -> str:
- return "jax-compressed-tensors"
+ return COMPRESSED_TENSORS

  def get_scheme(self,
  layer: torch.nn.Module,