tpu-inference 0.11.1.dev202511130813__py3-none-any.whl → 0.11.1.dev202511220812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (58)
  1. tests/lora/test_layers.py +0 -6
  2. tests/lora/utils.py +0 -8
  3. tests/test_envs.py +182 -0
  4. tests/test_utils.py +23 -14
  5. tpu_inference/__init__.py +22 -3
  6. tpu_inference/core/core_tpu.py +17 -9
  7. tpu_inference/core/disagg_utils.py +6 -8
  8. tpu_inference/distributed/tpu_connector.py +2 -3
  9. tpu_inference/distributed/utils.py +3 -2
  10. tpu_inference/envs.py +1 -1
  11. tpu_inference/executors/ray_distributed_executor.py +27 -11
  12. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
  13. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +110 -64
  14. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +7 -0
  15. tpu_inference/layers/{jax → common}/attention_interface.py +1 -1
  16. tpu_inference/layers/common/quant_methods.py +8 -0
  17. tpu_inference/layers/jax/attention/attention.py +1 -1
  18. tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
  19. tpu_inference/layers/jax/sample/sampling.py +2 -2
  20. tpu_inference/layers/vllm/attention.py +1 -1
  21. tpu_inference/layers/vllm/quantization/__init__.py +7 -3
  22. tpu_inference/layers/vllm/quantization/awq.py +4 -3
  23. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -2
  24. tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
  25. tpu_inference/layers/vllm/quantization/unquantized.py +4 -3
  26. tpu_inference/layers/vllm/sharding.py +2 -2
  27. tpu_inference/lora/torch_punica_tpu.py +1 -2
  28. tpu_inference/models/common/model_loader.py +12 -11
  29. tpu_inference/models/jax/llama3.py +4 -3
  30. tpu_inference/models/jax/llama_eagle3.py +9 -5
  31. tpu_inference/models/jax/llama_guard_4.py +361 -0
  32. tpu_inference/models/jax/qwen2.py +3 -2
  33. tpu_inference/models/jax/qwen2_5_vl.py +4 -3
  34. tpu_inference/models/jax/qwen3.py +3 -2
  35. tpu_inference/models/jax/utils/weight_utils.py +21 -8
  36. tpu_inference/models/vllm/vllm_model_wrapper.py +22 -10
  37. tpu_inference/platforms/tpu_platform.py +17 -7
  38. tpu_inference/runner/compilation_manager.py +37 -17
  39. tpu_inference/runner/kv_cache.py +1 -1
  40. tpu_inference/runner/kv_cache_manager.py +8 -2
  41. tpu_inference/runner/tpu_runner.py +199 -87
  42. tpu_inference/spec_decode/jax/eagle3.py +2 -1
  43. tpu_inference/tpu_info.py +4 -3
  44. tpu_inference/utils.py +7 -6
  45. tpu_inference/worker/tpu_worker.py +159 -23
  46. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/METADATA +2 -2
  47. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/RECORD +52 -54
  48. tpu_inference/mock/__init__.py +0 -0
  49. tpu_inference/mock/vllm_config_utils.py +0 -28
  50. tpu_inference/mock/vllm_envs.py +0 -1219
  51. tpu_inference/mock/vllm_logger.py +0 -212
  52. tpu_inference/mock/vllm_logging_utils.py +0 -15
  53. tpu_inference/models/jax/phi3.py +0 -376
  54. /tpu_inference/layers/{jax → common}/binary_search.py +0 -0
  55. /tpu_inference/layers/{jax → common}/sharding.py +0 -0
  56. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/WHEEL +0 -0
  57. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/licenses/LICENSE +0 -0
  58. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/top_level.txt +0 -0
@@ -440,42 +440,54 @@ def _ragged_paged_attention_kernel(
         debug_print("[RPA debug] bkv_sz_frm_new={}", bkv_sz_frm_new)
         debug_print("[RPA debug] page_indices_offset={}", page_indices_offset)
 
-        # Fetch effective kv from kv cache.
-        def loop_body(i, offset):
-            sz = jnp.minimum(page_size, kv_left_frm_cache - i * page_size)
-            _async_copy(
-                cache_hbm_ref.at[pl.ds(
-                    page_indices_ref[page_indices_offset + i] * page_size,
-                    sz)],
-                vmem_ref.at[pl.ds(i * page_size, sz)],
-                sem,
-                wait,
+        if not wait:
+            # Fetch effective kv from kv cache.
+            def loop_body(i, offset):
+                sz = jnp.minimum(page_size, kv_left_frm_cache - i * page_size)
+                _async_copy(
+                    cache_hbm_ref.at[pl.ds(
+                        page_indices_ref[page_indices_offset + i] * page_size,
+                        sz)],
+                    vmem_ref.at[pl.ds(i * page_size, sz)],
+                    sem,
+                    wait=False,
+                )
+                debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
+                return offset + sz
+
+            offset = lax.fori_loop(
+                0,
+                bkv_p_frm_cache,
+                loop_body,
+                0,  # offset
+                unroll=False,
             )
-            debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
-            return offset + sz
-
-        offset = lax.fori_loop(
-            0,
-            bkv_p_frm_cache,
-            loop_body,
-            0,  # offset
-            unroll=False,
-        )
 
-        # Fetch kv directly from new kv.
-        @pl.when(bkv_sz_frm_new > 0)
-        def _fetch_bkv_from_new_kv():
-            new_kv_len_start = q_end - kv_left_frm_new
-            debug_print("[RPA debug] new_kv_len_start={}", new_kv_len_start)
-            debug_print("[RPA debug] offset_in_bkv={}", offset)
+            # Fetch kv directly from new kv.
+            @pl.when(bkv_sz_frm_new > 0)
+            def _fetch_bkv_from_new_kv():
+                new_kv_len_start = q_end - kv_left_frm_new
+                debug_print("[RPA debug] new_kv_len_start={}",
+                            new_kv_len_start)
+                debug_print("[RPA debug] offset_in_bkv={}", offset)
+                _async_copy(
+                    kv_hbm_ref.at[pl.ds(new_kv_len_start, bkv_sz_frm_new)],
+                    vmem_ref.at[pl.ds(offset, bkv_sz_frm_new)],
+                    sem,
+                    wait,
+                )
+
+            return kv_len_start + offset, bkv_sz_frm_new
+        else:
+            offset = jnp.minimum(kv_left_frm_cache, page_size * bkv_p)
+            dst = vmem_ref.at[pl.ds(0, offset + bkv_sz_frm_new)]
             _async_copy(
-                kv_hbm_ref.at[pl.ds(new_kv_len_start, bkv_sz_frm_new)],
-                vmem_ref.at[pl.ds(offset, bkv_sz_frm_new)],
-                sem,
-                wait,
+                src=dst,
+                dst=dst,
+                sem=sem,
+                wait=True,
             )
-
-        return kv_len_start + offset, bkv_sz_frm_new
+            return kv_len_start + offset, bkv_sz_frm_new
 
     def _update_kv_cache(seq_idx,
                          bkv_sem_idx,
@@ -511,30 +523,41 @@ def _ragged_paged_attention_kernel(
         debug_print("[RPA debug] p_ignore={}", p_ignore)
         debug_print("[RPA debug] page_indices_offset={}", page_indices_offset)
 
-        def loop_body(i, states):
-            update_sz, ignore = states
-            sz = jnp.minimum(page_size - ignore, update_sz)
-
+        if not wait:
+
+            def loop_body(i, states):
+                update_sz, ignore = states
+                sz = jnp.minimum(page_size - ignore, update_sz)
+
+                _async_copy(
+                    vmem_ref.at[pl.ds((p_ignore + i) * page_size + ignore,
+                                      sz)],
+                    cache_hbm_ref.at[pl.ds(
+                        page_indices_ref[page_indices_offset + i] * page_size +
+                        ignore,
+                        sz,
+                    )],
+                    sem,
+                    wait=False,
+                )
+                debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
+                return update_sz - sz, 0
+
+            lax.fori_loop(
+                0,
+                kv_p_end - kv_p_start,
+                loop_body,
+                (update_sz, ignore),  # total transfer size
+                unroll=False,
+            )
+        else:
+            dst = cache_hbm_ref.at[pl.ds(0, update_sz)]
             _async_copy(
-                vmem_ref.at[pl.ds((p_ignore + i) * page_size + ignore, sz)],
-                cache_hbm_ref.at[pl.ds(
-                    page_indices_ref[page_indices_offset + i] * page_size +
-                    ignore,
-                    sz,
-                )],
-                sem,
-                wait,
+                src=dst,
+                dst=dst,
+                sem=sem,
+                wait=True,
             )
-            debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
-            return update_sz - sz, 0
-
-        lax.fori_loop(
-            0,
-            kv_p_end - kv_p_start,
-            loop_body,
-            (update_sz, ignore),  # total transfer size
-            unroll=False,
-        )
 
     def _fetch_bq(seq_idx, bq_idx, bq_sem_idx, *, wait=False):
         sem = sems.at[1, bq_sem_idx]
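Note on the refactor above: _fetch_bkv and _update_kv_cache now branch on wait. The wait=False path only starts the per-page DMA copies; the wait=True path blocks once on a single descriptor spanning the whole transfer, built with src=dst so no data actually moves, presumably so one wait covers the bytes posted by all the earlier per-page copies on the same semaphore. A minimal sketch of a helper with this shape, assuming Pallas TPU's make_async_copy (the package's actual _async_copy is not shown in this diff and may differ):

    from jax.experimental.pallas import tpu as pltpu

    def _async_copy(src, dst, sem, wait):
        # One descriptor per call; sem counts the bytes it transfers.
        copy = pltpu.make_async_copy(src, dst, sem)
        if wait:
            copy.wait()   # block until sem has drained this descriptor's bytes
        else:
            copy.start()  # enqueue the DMA and return immediately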
@@ -317,6 +317,20 @@ def _ragged_paged_attention_kernel(
     q_len = q_end - q_start
     kv_len = kv_lens_ref[seq_idx]
 
+    bkv_idx_start = 0 if sliding_window is None else jnp.maximum(
+        kv_len - sliding_window, 0) // bkv_sz
+
+    if sliding_window is None:
+        next_bkv_idx_start = 0
+    else:
+
+        def get_next_bkv_idx_start():
+            next_kv_len = kv_lens_ref[seq_idx + 1]
+            return jnp.maximum(next_kv_len - sliding_window, 0) // bkv_sz
+
+        next_bkv_idx_start = lax.cond(seq_idx + 1 < num_seqs,
+                                      get_next_bkv_idx_start, lambda: 0)
+
     def debug_print(msg, *args):
         if debug_mode:
             pl.debug_print(msg, *args)
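bkv_idx_start above is the index of the first KV block that can still intersect the sliding window, which lets the block loop and the prefetch below skip blocks that are entirely out of window. A worked example with illustrative numbers:

    kv_len, sliding_window, bkv_sz = 1000, 256, 128
    bkv_idx_start = max(kv_len - sliding_window, 0) // bkv_sz  # 744 // 128 == 5
    # Blocks 0..4 end at key 639, before the window start at key 744, so they
    # are skipped; block 5 (keys 640..767) is the first that overlaps the window.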
@@ -353,8 +367,8 @@ def _ragged_paged_attention_kernel(
         head_acc_ref = acc_ref.at[kv_head_idx, :q.shape[0]]
 
         def load_with_init(ref, init_val):
-            return jnp.where(bkv_idx == 0, jnp.full_like(ref, init_val),
-                             ref[...])
+            return jnp.where(bkv_idx == bkv_idx_start,
+                             jnp.full_like(ref, init_val), ref[...])
 
         # Follow FlashAttention-2 forward pass.
         if q_scale is not None:
@@ -378,9 +392,6 @@ def _ragged_paged_attention_kernel(
                                     num_q_heads_per_kv_head)
         k_span = bkv_idx * bkv_sz + lax.broadcasted_iota(jnp.int32, s.shape, 1)
         mask = q_span < k_span
-        # TODO(jevinjiang, xiowei): reduce pages_per_seq based on sliding_window.
-        if sliding_window is not None:
-            mask = jnp.logical_or(mask, q_span - sliding_window >= k_span)
 
         if soft_cap is not None:
             s = soft_cap * jnp.tanh(s / soft_cap)
@@ -391,7 +402,8 @@ def _ragged_paged_attention_kernel(
             sinks = attention_sink_ref[kv_head_idx]
             actual_bq_sz = q.shape[0] // num_q_heads_per_kv_head
             m_prev_init = jnp.concat([sinks] * actual_bq_sz, axis=0)
-            m_prev = jnp.where(bkv_idx == 0, m_prev_init, head_m_ref[...])
+            m_prev = jnp.where(bkv_idx == bkv_idx_start, m_prev_init,
+                               head_m_ref[...])
         else:
             m_prev = load_with_init(head_m_ref, -jnp.inf)
 
@@ -463,42 +475,54 @@ def _ragged_paged_attention_kernel(
         debug_print("[RPA debug] bkv_sz_frm_new={}", bkv_sz_frm_new)
         debug_print("[RPA debug] page_indices_offset={}", page_indices_offset)
 
-        # Fetch effective kv from kv cache.
-        def loop_body(i, offset):
-            sz = jnp.minimum(page_size, kv_left_frm_cache - i * page_size)
-            _async_copy(
-                cache_hbm_ref.at[pl.ds(
-                    page_indices_ref[page_indices_offset + i] * page_size,
-                    sz)],
-                vmem_ref.at[pl.ds(i * page_size, sz)],
-                sem,
-                wait,
+        if not wait:
+            # Fetch effective kv from kv cache.
+            def loop_body(i, offset):
+                sz = jnp.minimum(page_size, kv_left_frm_cache - i * page_size)
+                _async_copy(
+                    cache_hbm_ref.at[pl.ds(
+                        page_indices_ref[page_indices_offset + i] * page_size,
+                        sz)],
+                    vmem_ref.at[pl.ds(i * page_size, sz)],
+                    sem,
+                    wait=False,
+                )
+                debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
+                return offset + sz
+
+            offset = lax.fori_loop(
+                0,
+                bkv_p_frm_cache,
+                loop_body,
+                0,  # offset
+                unroll=False,
             )
-            debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
-            return offset + sz
-
-        offset = lax.fori_loop(
-            0,
-            bkv_p_frm_cache,
-            loop_body,
-            0,  # offset
-            unroll=False,
-        )
 
-        # Fetch kv directly from new kv.
-        @pl.when(bkv_sz_frm_new > 0)
-        def _fetch_bkv_from_new_kv():
-            new_kv_len_start = q_end - kv_left_frm_new
-            debug_print("[RPA debug] new_kv_len_start={}", new_kv_len_start)
-            debug_print("[RPA debug] offset_in_bkv={}", offset)
+            # Fetch kv directly from new kv.
+            @pl.when(bkv_sz_frm_new > 0)
+            def _fetch_bkv_from_new_kv():
+                new_kv_len_start = q_end - kv_left_frm_new
+                debug_print("[RPA debug] new_kv_len_start={}",
+                            new_kv_len_start)
+                debug_print("[RPA debug] offset_in_bkv={}", offset)
+                _async_copy(
+                    kv_hbm_ref.at[pl.ds(new_kv_len_start, bkv_sz_frm_new)],
+                    vmem_ref.at[pl.ds(offset, bkv_sz_frm_new)],
+                    sem,
+                    wait,
+                )
+
+            return kv_len_start + offset, bkv_sz_frm_new
+        else:
+            offset = jnp.minimum(kv_left_frm_cache, page_size * bkv_p)
+            dst = vmem_ref.at[pl.ds(0, offset + bkv_sz_frm_new)]
             _async_copy(
-                kv_hbm_ref.at[pl.ds(new_kv_len_start, bkv_sz_frm_new)],
-                vmem_ref.at[pl.ds(offset, bkv_sz_frm_new)],
-                sem,
-                wait,
+                src=dst,
+                dst=dst,
+                sem=sem,
+                wait=True,
             )
-
-        return kv_len_start + offset, bkv_sz_frm_new
+            return kv_len_start + offset, bkv_sz_frm_new
 
     def _update_kv_cache(seq_idx,
                          bkv_sem_idx,
@@ -534,30 +558,41 @@ def _ragged_paged_attention_kernel(
         debug_print("[RPA debug] p_ignore={}", p_ignore)
         debug_print("[RPA debug] page_indices_offset={}", page_indices_offset)
 
-        def loop_body(i, states):
-            update_sz, ignore = states
-            sz = jnp.minimum(page_size - ignore, update_sz)
-
+        if not wait:
+
+            def loop_body(i, states):
+                update_sz, ignore = states
+                sz = jnp.minimum(page_size - ignore, update_sz)
+
+                _async_copy(
+                    vmem_ref.at[pl.ds((p_ignore + i) * page_size + ignore,
+                                      sz)],
+                    cache_hbm_ref.at[pl.ds(
+                        page_indices_ref[page_indices_offset + i] * page_size +
+                        ignore,
+                        sz,
+                    )],
+                    sem,
+                    wait=False,
+                )
+                debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
+                return update_sz - sz, 0
+
+            lax.fori_loop(
+                0,
+                kv_p_end - kv_p_start,
+                loop_body,
+                (update_sz, ignore),  # total transfer size
+                unroll=False,
+            )
+        else:
+            dst = cache_hbm_ref.at[pl.ds(0, update_sz)]
             _async_copy(
-                vmem_ref.at[pl.ds((p_ignore + i) * page_size + ignore, sz)],
-                cache_hbm_ref.at[pl.ds(
-                    page_indices_ref[page_indices_offset + i] * page_size +
-                    ignore,
-                    sz,
-                )],
-                sem,
-                wait,
+                src=dst,
+                dst=dst,
+                sem=sem,
+                wait=True,
             )
-            debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
-            return update_sz - sz, 0
-
-        lax.fori_loop(
-            0,
-            kv_p_end - kv_p_start,
-            loop_body,
-            (update_sz, ignore),  # total transfer size
-            unroll=False,
-        )
 
     def _fetch_bq(seq_idx, bq_idx, bq_sem_idx, *, wait=False):
         sem = sems.at[1, bq_sem_idx]
@@ -719,12 +754,19 @@ def _ragged_paged_attention_kernel(
     def get_next_bkv_ids(seq_idx, bq_idx, bkv_idx, bkv_sem_idx):
         next_bkv_idx = bkv_idx + 1
         is_last_bkv = next_bkv_idx == num_bkv
-        next_bkv_idx = lax.select(is_last_bkv, 0, next_bkv_idx)
         next_bq_idx = lax.select(is_last_bkv, bq_idx + 1, bq_idx)
         is_last_bq = next_bq_idx == num_bq
         next_bq_idx = lax.select(is_last_bq, 0, next_bq_idx)
         next_seq_idx = lax.select(is_last_bq, seq_idx + 1, seq_idx)
         next_bkv_sem_idx = lax.select(bkv_sem_idx == 0, 1, 0)
+
+        next_bkv_idx = lax.select(
+            is_last_bkv,
+            lax.select(
+                is_last_bq,
+                next_bkv_idx_start,
+                bkv_idx_start,
+            ), next_bkv_idx)
         return next_seq_idx, next_bq_idx, next_bkv_idx, next_bkv_sem_idx
 
     def compute_with_bq(bq_idx, _):
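get_next_bkv_ids drives the double-buffered KV prefetch; the change above makes the block counter wrap to the first in-window block rather than 0 when it rolls over. A plain-Python equivalent of the updated logic (semaphore index elided; num_bkv, num_bq, bkv_idx_start and next_bkv_idx_start as in the kernel):

    def get_next_bkv_ids(seq_idx, bq_idx, bkv_idx):
        next_bkv_idx = bkv_idx + 1
        is_last_bkv = next_bkv_idx == num_bkv
        next_bq_idx = bq_idx + 1 if is_last_bkv else bq_idx
        is_last_bq = next_bq_idx == num_bq
        next_bq_idx = 0 if is_last_bq else next_bq_idx
        next_seq_idx = seq_idx + 1 if is_last_bq else seq_idx
        if is_last_bkv:
            # Wrap to this sequence's first in-window block, or to the next
            # sequence's start when the last query block just finished.
            next_bkv_idx = next_bkv_idx_start if is_last_bq else bkv_idx_start
        return next_seq_idx, next_bq_idx, next_bkv_idx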
@@ -759,7 +801,7 @@ def _ragged_paged_attention_kernel(
                            next_bkv_sem_idx)
 
         # Wait for cur bq if not ready yet
-        @pl.when(bkv_idx == 0)
+        @pl.when(bkv_idx == bkv_idx_start)
         def wait_cur_bq():
             wait_fetch_bq(seq_idx, bq_idx, bq_sem_idx)
 
@@ -808,7 +850,11 @@ def _ragged_paged_attention_kernel(
                 kv_head_idx=kv_head_idx,
             )
 
-        lax.fori_loop(0, num_bkv, compute_with_bkv, None, unroll=False)
+        lax.fori_loop(bkv_idx_start,
+                      num_bkv,
+                      compute_with_bkv,
+                      None,
+                      unroll=False)
 
         # Load acc and calculate final output.
         acc = acc_ref[...]
@@ -838,7 +884,7 @@ def _ragged_paged_attention_kernel(
     @pl.when(seq_idx == 0)
     def prologue():
         start_fetch_bq(0, 0, 0)
-        start_fetch_bkv(0, 0, 0)
+        start_fetch_bkv(0, bkv_idx_start, 0)
 
     @pl.when(seq_idx < decode_end)
     def process_decode():
@@ -1231,6 +1231,13 @@ TUNED_BLOCK_SIZES = {
                 },
             }
         },
+        16: {
+            'q_bfloat16_kv_bfloat16': {
+                'q_head-8_kv_head-1_head-128': {
+                    262144: (128, 256),
+                }
+            }
+        },
     },
     'TPU v5e': {
         128: {
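Judging by the key names, the table nests TPU generation, page size, query/KV dtype pair, head geometry, and padded context length, with the value pair being the tuned kernel block sizes; these level meanings are inferred from the names, not stated in the diff. An illustrative lookup of the new entry ('TPU vXe' stands in for the generation key enclosing this hunk, which is not visible here):

    blk_a, blk_b = TUNED_BLOCK_SIZES['TPU vXe'][16] \
        ['q_bfloat16_kv_bfloat16']['q_head-8_kv_head-1_head-128'][262144]
    # (blk_a, blk_b) == (128, 256)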
@@ -17,7 +17,7 @@ import tpu_inference.kernels.ragged_paged_attention.v3.kernel as rpa
 import tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 as rpa_hd64
 from tpu_inference.kernels.flash_attention.kernel import flash_attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.jax.sharding import ShardingAxisName
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.utils import get_megacore
 
 MAX_ALLOWED_PAGE_INDICES_N = (
@@ -0,0 +1,8 @@
+UNQUANTIZED = "unquantized"
+MXFP4 = "mxfp4"
+AWQ = "awq"
+COMPRESSED_TENSORS = "compressed-tensors"
+
+
+def get_tpu_quant_method(quant_method: str) -> str:
+    return "tpu-" + quant_method
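The new quant_methods module centralizes the checkpoint-level method names and the TPU registration prefix used throughout the quantization hunks below. Usage implied directly by the definitions above:

    from tpu_inference.layers.common.quant_methods import (
        AWQ, COMPRESSED_TENSORS, get_tpu_quant_method)

    assert get_tpu_quant_method(AWQ) == "tpu-awq"
    assert get_tpu_quant_method(COMPRESSED_TENSORS) == "tpu-compressed-tensors"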
@@ -13,9 +13,9 @@ from tpu_inference import utils
 from tpu_inference.kernels.ragged_paged_attention.v3.kernel import \
     ragged_paged_attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.base import create_param
 from tpu_inference.layers.jax.rope_interface import apply_rope
-from tpu_inference.layers.jax.sharding import ShardingAxisName
 
 KVCache = Tuple[jax.Array, jax.Array]
 
@@ -12,7 +12,7 @@ import jax
 import jax.numpy as jnp
 import numpy as np
 
-from tpu_inference.layers.jax.binary_search import topk_mask, topp_mask
+from tpu_inference.layers.common.binary_search import topk_mask, topp_mask
 from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata
 
@@ -6,10 +6,10 @@ from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
 from vllm.v1.outputs import LogprobsTensors
 
-from tpu_inference.layers.jax.binary_search import topk_mask, topp_mask
+from tpu_inference.layers.common.binary_search import topk_mask, topp_mask
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata
-from tpu_inference.layers.jax.sharding import ShardingAxisName
 
 _SAMPLING_EPS = 1e-5
 
@@ -13,8 +13,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 
 from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.logger import init_logger
 from tpu_inference.models.vllm.vllm_model_wrapper_context import \
     get_vllm_model_wrapper_context
@@ -5,10 +5,12 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizationConfig
 
+from tpu_inference.layers.common import quant_methods
 from tpu_inference.layers.vllm.quantization.awq import VllmAWQConfig
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import \
     VllmCompressedTensorsConfig  # noqa: E501
+from tpu_inference.layers.vllm.quantization.mxfp4 import VllmMxfp4Config
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedConfig
 
@@ -19,8 +21,9 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
     # TODO(kyuyeunk): Add support for "tpu_int8".
     method_to_config: dict[str, str] = {
         None: VllmUnquantizedConfig,
-        "compressed-tensors": VllmCompressedTensorsConfig,
-        "awq": VllmAWQConfig,
+        quant_methods.COMPRESSED_TENSORS: VllmCompressedTensorsConfig,
+        quant_methods.AWQ: VllmAWQConfig,
+        quant_methods.MXFP4: VllmMxfp4Config,
     }
     if model_config.quantization not in method_to_config:
         raise NotImplementedError(
@@ -30,6 +33,7 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
     assert issubclass(quant_config, JaxCommonConfig)
     quant_config.set_configs(vllm_config, mesh)
 
-    model_config.quantization = quant_config.get_name()
+    model_config.quantization = quant_methods.get_tpu_quant_method(
+        quant_config.get_name())
     return VllmConfig.get_quantization_config(model_config,
                                               vllm_config.load_config)
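Taken together with the registration hunks below, the renaming works in two steps: the checkpoint's plain method name selects the TPU config class, then model_config.quantization is rewritten with the tpu- prefix, which should make vLLM resolve the TPU-registered class rather than its stock config. An illustrative trace, using "awq" as the incoming method:

    name = "awq"                            # model_config.quantization on entry
    config_cls = method_to_config[name]     # VllmAWQConfig
    model_config.quantization = quant_methods.get_tpu_quant_method(
        config_cls.get_name())              # "tpu-awq", the same key used by
                                            # @register_quantization_config below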
@@ -18,6 +18,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     is_layer_skipped, unpack_quantized_values_into_int32)
 from vllm.scalar_type import scalar_types
 
+from tpu_inference.layers.common.quant_methods import AWQ, get_tpu_quant_method
 from tpu_inference.layers.vllm.linear_common import (
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
 from tpu_inference.layers.vllm.quantization.common import (
@@ -29,12 +30,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config("jax-awq")
+@register_quantization_config(get_tpu_quant_method(AWQ))
 class VllmAWQConfig(AWQConfig, JaxCommonConfig):
 
     @classmethod
-    def get_name(cls) -> str:
-        return "jax-awq"
+    def get_name(cls):
+        return AWQ
 
     def get_supported_act_dtypes(self) -> list[torch.dtype]:
         # NOTE: AWQ checkpoint was quantized with float16. But on TPUs, using
@@ -16,6 +16,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     find_matched_target, should_ignore_layer)
 
+from tpu_inference.layers.common.quant_methods import (COMPRESSED_TENSORS,
+                                                       get_tpu_quant_method)
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import \
     VllmCompressedTensorsW8A8Fp8MoEMethod
@@ -30,12 +32,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config("jax-compressed-tensors")
+@register_quantization_config(get_tpu_quant_method(COMPRESSED_TENSORS))
 class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
 
     @classmethod
     def get_name(cls) -> str:
-        return "jax-compressed-tensors"
+        return COMPRESSED_TENSORS
 
     def get_scheme(self,
                    layer: torch.nn.Module,