PyPI - tpu-inference - Versions diffs - 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl - Mend

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (59) hide show

tests/kernels/fused_moe_v1_test.py +303 -34
tests/kernels/mla_v1_test.py +129 -41
tests/kernels/quantized_matmul_kernel_test.py +2 -34
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
tests/lora/test_layers.py +4 -1
tests/lora/test_lora_perf.py +53 -0
tests/test_envs.py +110 -12
tests/test_quantization.py +3 -0
tests/test_utils.py +1 -2
tpu_inference/distributed/tpu_connector.py +1 -1
tpu_inference/envs.py +92 -8
tpu_inference/executors/ray_distributed_executor.py +5 -1
tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
tpu_inference/kernels/mla/v1/kernel.py +98 -120
tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +82 -32
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +146 -85
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
tpu_inference/layers/common/attention_interface.py +7 -1
tpu_inference/layers/common/sharding.py +11 -7
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
tpu_inference/layers/vllm/fused_moe.py +170 -208
tpu_inference/layers/vllm/linear_common.py +43 -21
tpu_inference/layers/vllm/quantization/common.py +11 -6
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
tpu_inference/models/common/model_loader.py +78 -22
tpu_inference/models/jax/deepseek_v3.py +185 -64
tpu_inference/models/jax/gpt_oss.py +3 -3
tpu_inference/models/jax/llama_eagle3.py +4 -5
tpu_inference/models/jax/qwen2_5_vl.py +161 -47
tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
tpu_inference/models/jax/utils/weight_utils.py +203 -155
tpu_inference/models/vllm/vllm_model_wrapper.py +11 -5
tpu_inference/platforms/tpu_platform.py +29 -48
tpu_inference/runner/compilation_manager.py +112 -46
tpu_inference/runner/kv_cache.py +40 -20
tpu_inference/runner/kv_cache_manager.py +40 -31
tpu_inference/runner/persistent_batch_manager.py +40 -2
tpu_inference/runner/structured_decoding_manager.py +2 -3
tpu_inference/runner/tpu_runner.py +94 -51
tpu_inference/runner/utils.py +2 -2
tpu_inference/spec_decode/jax/eagle3.py +71 -22
tpu_inference/utils.py +41 -14
tpu_inference/worker/tpu_worker.py +43 -45
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +8 -9
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +59 -58
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0

tpu_inference/kernels/mla/v1/kernel.py CHANGED Viewed

@@ -16,17 +16,30 @@ DEFAULT_MASK_VALUE = -0.7 * float(jnp.finfo(jnp.dtype("float32")).max)
 DEFAULT_VMEM_LIMIT_BYTES = 100 * 1024 * 1024
+def get_kv_cache_shape(
+    total_num_pages,
+    page_size,
+    kv_dim,
+    kv_dtype,
+):
+    kv_packing = get_dtype_packing(kv_dtype)
+    return (
+        total_num_pages,
+        align_to(page_size, kv_packing) // kv_packing,
+        kv_packing,
+        align_to(kv_dim, 128),
+    )
 @functools.partial(
     jax.jit,
-    donate_argnames=("cache_kv_c", "cache_k_pe"),
+    donate_argnames=("cache_kv"),
 )
 def update_kv_cache(
         new_kv_c: jax.Array,  # [num_tokens, actual_lkv_dim]
         new_k_pe: jax.Array,  # [num_tokens, actual_r_dim]
-        cache_kv_c: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-        cache_k_pe: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
+        cache_kv: jax.
+    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim+r_dim]
         kv_lens: jax.Array,  # i32[max_num_seqs]
         page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
         cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
@@ -43,25 +56,21 @@ def update_kv_cache(
     if actual_lkv_dim != lkv_dim:
         new_kv_c = jnp.pad(new_kv_c, ((0, 0), (0, lkv_dim - actual_lkv_dim)),
                            constant_values=0)
-    _, page_size_per_kv_packing, kv_packing, cache_lkv_dim = cache_kv_c.shape
-    _, _, _, cache_r_dim = cache_k_pe.shape
-    assert lkv_dim == cache_lkv_dim
-    assert r_dim == cache_r_dim
+    kv_dim = r_dim + lkv_dim
+    _, page_size_per_kv_packing, kv_packing, cache_kv_dim = cache_kv.shape
+    assert kv_dim == cache_kv_dim
     page_size = page_size_per_kv_packing * kv_packing
     max_num_seqs = kv_lens.shape[0]
     num_page_indices = page_indices.shape[0]
     pages_per_seq = num_page_indices // max_num_seqs
-    def seq_loop_body(i, caches):
-        cache_kv_c, cache_k_pe = caches
+    def seq_loop_body(i, cache_kv):
         q_start, q_end = cu_q_lens[i], cu_q_lens[i + 1]
         q_len = q_end - q_start
         kv_len = kv_lens[i]
-        def token_loop_body(j, caches_):
-            cache_kv_c_, cache_k_pe_ = caches_
+        def token_loop_body(j, cache_kv_):
             token_idx_in_seq = kv_len - q_len + j
             page_num_in_seq = token_idx_in_seq // page_size
             page_indices_start = i * pages_per_seq
@@ -69,18 +78,17 @@ def update_kv_cache(
             row = (token_idx_in_seq % page_size) // kv_packing
             col = (token_idx_in_seq % page_size) % kv_packing
-            cache_kv_c_ = cache_kv_c_.at[page_idx, row,
-                                         col].set(new_kv_c[q_start + j])
-            cache_k_pe_ = cache_k_pe_.at[page_idx, row,
-                                         col].set(new_k_pe[q_start + j])
-            return cache_kv_c_, cache_k_pe_
+            cache_kv_ = cache_kv_.at[page_idx, row, col,
+                                     ..., :lkv_dim].set(new_kv_c[q_start + j])
+            cache_kv_ = cache_kv_.at[page_idx, row, col, ...,
+                                     lkv_dim:].set(new_k_pe[q_start + j])
+            return cache_kv_
+        return lax.fori_loop(0, q_len, token_loop_body, cache_kv)
-        return lax.fori_loop(0, q_len, token_loop_body,
-                             (cache_kv_c, cache_k_pe))
+    cache_kv = lax.fori_loop(0, distribution[-1], seq_loop_body, cache_kv)
-    cache_kv_c, cache_k_pe = lax.fori_loop(0, distribution[-1], seq_loop_body,
-                                           (cache_kv_c, cache_k_pe))
-    return cache_kv_c, cache_k_pe
+    return cache_kv
 def ref_mla_ragged_paged_attention(
@@ -88,10 +96,8 @@ def ref_mla_ragged_paged_attention(
     q_pe: jax.Array,  # [num_tokens, actual_num_q_heads, actual_r_dim]
     new_kv_c: jax.Array,  # [num_tokens, actual_lkv_dim]
     new_k_pe: jax.Array,  # [num_tokens, actual_r_dim]
-    cache_kv_c: jax.
+    cache_kv: jax.
     Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-    cache_k_pe: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
     kv_lens: jax.Array,  # i32[max_num_seqs]
     page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
     cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
@@ -111,8 +117,7 @@ def ref_mla_ragged_paged_attention(
         q_pe,
         new_kv_c,
         new_k_pe,
-        cache_kv_c,
-        cache_k_pe,
+        cache_kv,
         kv_lens,
         page_indices,
         cu_q_lens,
@@ -123,11 +128,10 @@ def ref_mla_ragged_paged_attention(
         mask_value=mask_value,
     )
-    cache_kv_c, cache_k_pe = update_kv_cache(
+    updated_cache_kv = update_kv_cache(
         new_kv_c,
         new_k_pe,
-        cache_kv_c,
-        cache_k_pe,
+        cache_kv,
         kv_lens,
         page_indices,
         cu_q_lens,
@@ -154,13 +158,17 @@ def ref_mla_ragged_paged_attention(
     assert num_page_indices % max_num_seqs == 0
     pages_per_seq = num_page_indices // max_num_seqs
-    total_num_pages, page_size_per_kv_packing, kv_packing, _ = cache_kv_c.shape
+    total_num_pages, page_size_per_kv_packing, kv_packing, _ = updated_cache_kv.shape
     page_size = page_size_per_kv_packing * kv_packing
     assert lkv_dim == ql_nope.shape[-1]
     assert r_dim == q_pe.shape[-1]
+    assert lkv_dim + r_dim == updated_cache_kv.shape[-1]
-    kv_c_cache = cache_kv_c.reshape(total_num_pages, page_size, lkv_dim)
-    k_pe_cache = cache_k_pe.reshape(total_num_pages, page_size, r_dim)
+    kv_c_cache = updated_cache_kv[..., :lkv_dim].reshape(
+        total_num_pages, page_size, lkv_dim)
+    k_pe_cache = updated_cache_kv[...,
+                                  lkv_dim:].reshape(total_num_pages, page_size,
+                                                    r_dim)
     outputs = []
@@ -221,8 +229,7 @@ def ref_mla_ragged_paged_attention(
     return (
         jnp.concatenate(outputs, axis=0),
-        cache_kv_c,
-        cache_k_pe,
+        updated_cache_kv,
     )
@@ -232,10 +239,8 @@ def dynamic_validate_inputs(
     q_pe: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_r_dim]
     new_kv_c: jax.Array,  # [max_num_tokens, actual_lkv_dim]
     new_k_pe: jax.Array,  # [max_num_tokens, actual_r_dim]
-    cache_kv_c: jax.
+    cache_kv: jax.
     Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-    cache_k_pe: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
     kv_lens: jax.Array,  # i32[max_num_seqs]
     page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
     cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
@@ -260,8 +265,7 @@ def dynamic_validate_inputs(
         q_pe,
         new_kv_c,
         new_k_pe,
-        cache_kv_c,
-        cache_k_pe,
+        cache_kv,
         kv_lens,
         page_indices,
         cu_q_lens,
@@ -277,8 +281,8 @@ def dynamic_validate_inputs(
         debug_mode=debug_mode,
     )
     max_num_tokens = ql_nope.shape[0]
-    total_num_pages = cache_kv_c.shape[0]
-    _, page_size_per_kv_packing, kv_packing, _ = cache_kv_c.shape
+    total_num_pages = cache_kv.shape[0]
+    _, page_size_per_kv_packing, kv_packing, _ = cache_kv.shape
     page_size = page_size_per_kv_packing * kv_packing
     max_num_seqs = kv_lens.shape[0]
     num_page_indices = page_indices.shape[0]
@@ -320,10 +324,8 @@ def static_validate_inputs(
     q_pe: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_r_dim]
     new_kv_c: jax.Array,  # [max_num_tokens, actual_lkv_dim]
     new_k_pe: jax.Array,  # [max_num_tokens, actual_r_dim]
-    cache_kv_c: jax.
+    cache_kv: jax.
     Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-    cache_k_pe: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
     kv_lens: jax.Array,  # i32[max_num_seqs]
     page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
     cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
@@ -373,44 +375,34 @@ def static_validate_inputs(
     actual_lkv_dim = ql_nope.shape[2]
     actual_r_dim = q_pe.shape[2]
+    lkv_dim = align_to(actual_lkv_dim, 128)
+    r_dim = align_to(actual_r_dim, 128)
     (
         _,
         page_size_per_kv_packing,
         kv_packing,
-        lkv_dim,
-    ) = cache_kv_c.shape
-    _, _, _, r_dim = cache_k_pe.shape
+        kv_dim,
+    ) = cache_kv.shape
-    if lkv_dim != align_to(actual_lkv_dim, 128):
-        raise ValueError(
-            f"Expected {lkv_dim=} is equal to {align_to(actual_lkv_dim, 128)=}"
-        )
-    if r_dim != align_to(actual_r_dim, 128):
+    if lkv_dim + r_dim != kv_dim:
         raise ValueError(
-            f"Expected {r_dim=} is equal to {align_to(actual_r_dim, 128)=}")
+            f"Expected {lkv_dim=} + {r_dim=} to be equal to {kv_dim=}")
-    if not (cache_kv_c.dtype == new_kv_c.dtype):
+    if not (cache_kv.dtype == new_kv_c.dtype):
         raise ValueError(
-            f"Expected {cache_kv_c.dtype=} to be equal to {new_kv_c.dtype=}.")
-    if not (cache_k_pe.dtype == new_k_pe.dtype):
+            f"Expected {cache_kv.dtype=} to be equal to {new_kv_c.dtype=}.")
+    if not (cache_kv.dtype == new_k_pe.dtype):
         raise ValueError(
-            f"Expected {cache_k_pe.dtype=} to be equal to {new_k_pe.dtype=}.")
+            f"Expected {cache_kv.dtype=} to be equal to {new_k_pe.dtype=}.")
     # Integer kv quantization is currently not supported.
-    if not jnp.issubdtype(cache_kv_c.dtype, jnp.floating):
-        raise ValueError(
-            f"Expected {cache_kv_c.dtype=} to be a floating point.")
-    if not jnp.issubdtype(cache_k_pe.dtype, jnp.floating):
-        raise ValueError(
-            f"Expected {cache_k_pe.dtype=} to be a floating point.")
+    if not jnp.issubdtype(cache_kv.dtype, jnp.floating):
+        raise ValueError(f"Expected {cache_kv.dtype=} to be a floating point.")
-    if kv_packing != get_dtype_packing(cache_kv_c.dtype):
+    if kv_packing != get_dtype_packing(cache_kv.dtype):
         raise ValueError(
-            f"{kv_packing=} does not match with {cache_kv_c.dtype=}")
-    if kv_packing != get_dtype_packing(cache_k_pe.dtype):
-        raise ValueError(
-            f"{kv_packing=} does not match with {cache_k_pe.dtype=}")
+            f"{kv_packing=} does not match with {cache_kv.dtype=}")
     if not (jnp.int32 == kv_lens.dtype == page_indices.dtype == cu_q_lens.dtype
             == distribution.dtype):
@@ -475,14 +467,12 @@ def _mla_ragged_paged_attention_kernel(
     q_pe_hbm_ref,  # [max_num_tokens, num_q_heads_per_q_packing, q_packing, r_dim]
     new_kv_c_hbm_ref,  # [max_num_tokens_per_kv_packing, kv_packing, lkv_dim]
     new_k_pe_hbm_ref,  # [max_num_tokens_per_kv_packing, kv_packing, r_dim]
-    cache_kv_c_hbm_ref,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-    cache_k_pe_hbm_ref,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
+    cache_kv_hbm_ref,  # [total_num_pages, page_size_per_kv_packing, kv_packing, align_to(lkv_dim + r_dim, 128)]
     # Output
     o_hbm_ref,  # [max_num_tokens, num_q_heads_per_q_packing, q_packing, lkv_dim]
-    updated_cache_kv_c_hbm_ref,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-    updated_cache_k_pe_hbm_ref,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
+    updated_cache_kv_hbm_ref,  # [total_num_pages, page_size_per_kv_packing, kv_packing, align_to(lkv_dim + r_dim, 128)]
     # Scratch
-    bkvc_x2_ref,  # [2, bkv_sz_per_kv_packing, kv_packing, lkv_dim]
+    bkvc_x2_ref,  # [2, bkv_sz_per_kv_packing, kv_packing, lkv_dim].
     bkpe_x2_ref,  # [2, bkv_sz_per_kv_packing, kv_packing, r_dim]
     bq_nope_x2_ref,  # [2, bq_sz, num_q_heads_per_q_packing, q_packing, lkv_dim]
     bq_rope_x2_ref,  # [2, bq_sz, num_q_heads_per_q_packing, q_packing, r_dim]
@@ -505,20 +495,24 @@ def _mla_ragged_paged_attention_kernel(
     debug_mode: bool = False,
 ):
     assert ql_nope_hbm_ref.shape == o_hbm_ref.shape
-    assert ql_nope_hbm_ref.shape[-1] == cache_kv_c_hbm_ref.shape[-1]
-    assert q_pe_hbm_ref.shape[-1] == cache_k_pe_hbm_ref.shape[-1]
+    # Validation checks on the dimensions
+    nope_dim = ql_nope_hbm_ref.shape[-1]
+    pe_dim = q_pe_hbm_ref.shape[-1]
+    assert nope_dim + pe_dim == cache_kv_hbm_ref.shape[-1]
     _, num_q_heads_per_q_packing, q_packing, lkv_dim = ql_nope_hbm_ref.shape
     r_dim = q_pe_hbm_ref.shape[-1]
     num_q_heads = num_q_heads_per_q_packing * q_packing
     total_num_pages, page_size_per_kv_packing, kv_packing, _ = (
-        cache_kv_c_hbm_ref.shape)
+        cache_kv_hbm_ref.shape)
     max_num_seqs = kv_lens_ref.shape[0]
     num_page_indices = page_indices_ref.shape[0]
     assert num_page_indices % max_num_seqs == 0
     pages_per_seq = num_page_indices // max_num_seqs
     q_dtype = ql_nope_hbm_ref.dtype
-    kv_dtype = cache_kv_c_hbm_ref.dtype
+    # Validate against the KV dtype.
+    kv_dtype = cache_kv_hbm_ref.dtype
     assert q_pe_hbm_ref.dtype == q_dtype
     assert o_hbm_ref.dtype == q_dtype
     assert get_dtype_packing(q_dtype) == q_packing
@@ -561,8 +555,8 @@ def _mla_ragged_paged_attention_kernel(
     def flash_attention(
         ql_nope,  # [actual_bq_sz * num_q_heads, lkv_dim]
         q_pe,  # [actual_bq_sz * num_q_heads, r_dim]
-        kv_c,  # [bkv_sz, lkv_dim]
-        k_pe,  # [bkv_sz, r_dim]
+        kv_c,  # [bkv_sz, lkv_dim] <- Correspond to data from bkvc_x2_ref
+        k_pe,  # [bkv_sz, r_dim] <- Correspond to data from bpe_x2_ref
         *,
         bq_idx,
         bkv_idx,
@@ -649,14 +643,9 @@ def _mla_ragged_paged_attention_kernel(
         sem = sems.at[0, bkv_sem_idx]
         bkvc_vmem_ref = bkvc_x2_ref.at[bkv_sem_idx]
         bkvpe_vmem_ref = bkpe_x2_ref.at[bkv_sem_idx]
-        reshaped_cache_kv_c_hbm_ref = cache_kv_c_hbm_ref.reshape(
+        reshaped_cache_hbm_ref = cache_kv_hbm_ref.reshape(
             total_num_pages * page_size_per_kv_packing,
-            *cache_kv_c_hbm_ref.shape[2:],
-        )
-        reshaped_cache_k_pe_hbm_ref = cache_k_pe_hbm_ref.reshape(
-            total_num_pages * page_size_per_kv_packing,
-            *cache_k_pe_hbm_ref.shape[2:],
+            *cache_kv_hbm_ref.shape[2:],
         )
         kv_len = kv_lens_ref[seq_idx]
         kv_len_start = bkv_idx * bkv_sz
@@ -684,22 +673,22 @@ def _mla_ragged_paged_attention_kernel(
                 kv_left_per_kv_packing - i * page_size_per_kv_packing,
             )
             _async_copy(
-                reshaped_cache_kv_c_hbm_ref.at[pl.ds(
+                reshaped_cache_hbm_ref.at[pl.ds(
                     page_indices_ref[page_indices_offset + i] *
                     page_size_per_kv_packing,
                     sz_per_kv_packing,
-                )],
+                ), ..., :nope_dim],
                 bkvc_vmem_ref.at[pl.ds(i * page_size_per_kv_packing,
                                        sz_per_kv_packing)],
                 sem,
                 wait,
             )
             _async_copy(
-                reshaped_cache_k_pe_hbm_ref.at[pl.ds(
+                reshaped_cache_hbm_ref.at[pl.ds(
                     page_indices_ref[page_indices_offset + i] *
                     page_size_per_kv_packing,
                     sz_per_kv_packing,
-                )],
+                ), ..., nope_dim:],
                 bkvpe_vmem_ref.at[pl.ds(i * page_size_per_kv_packing,
                                         sz_per_kv_packing)],
                 sem,
@@ -835,7 +824,6 @@ def _mla_ragged_paged_attention_kernel(
                                        jnp.zeros_like(concated_bkvc_vec))
         concated_bkvc_vec = pltpu.bitcast(concated_bkvc_vec.astype(repack_ty),
                                           kv_dtype)
         bkpe_ref = (bkpe_x2_ref.bitcast(jnp.uint32).at[bkv_sem_idx].reshape(
             bkv_sz_per_kv_packing, r_dim))
         bkpe_vec = bkpe_ref[...]
@@ -1082,17 +1070,16 @@ def prepare_outputs(
         "vmem_limit_bytes",
         "debug_mode",
     ),
-    donate_argnames=("cache_kv_c", "cache_k_pe"),
+    donate_argnames=("cache_kv"),
 )
 def mla_ragged_paged_attention(
     ql_nope: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_lkv_dim]
     q_pe: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_r_dim]
     new_kv_c: jax.Array,  # [max_num_tokens, actual_lkv_dim]
     new_k_pe: jax.Array,  # [max_num_tokens, actual_r_dim]
-    cache_kv_c: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, lkv_dim]
-    cache_k_pe: jax.
-    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, r_dim]
+    # TODO(gpolovets): Explore separating out into lkv & pe KV caches.
+    cache_kv: jax.
+    Array,  # [total_num_pages, page_size_per_kv_packing, kv_packing, align_to(lkv_dim, 128)]
     kv_lens: jax.Array,  # i32[max_num_seqs]
     page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
     cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
@@ -1124,8 +1111,7 @@ def mla_ragged_paged_attention(
     q_pe: concatenated all sequences' rope.
     new_kv_c: concatenated all sequences' kv_c values
     new_k_pe: concatenated all sequences' k_pe values
-    cache_kv_c: the current kv_c cache.
-    cache_k_pe: the current k_pe cache.
+    cache_kv: the current kv cache.
     kv_lens: the length of each sequence in the kv cache.
     page_indices: flattened page indices look-up table by (seq_id, page_id).
     cu_q_lens: the cumulative sum of the effective query lengths. Similar to
@@ -1159,8 +1145,7 @@ def mla_ragged_paged_attention(
         q_pe,
         new_kv_c,
         new_k_pe,
-        cache_kv_c,
-        cache_k_pe,
+        cache_kv,
         kv_lens,
         page_indices,
         cu_q_lens,
@@ -1177,11 +1162,10 @@ def mla_ragged_paged_attention(
     )
     # TODO(chengjiyao): fuse kv cache update into the kernel.
-    cache_kv_c, cache_k_pe = update_kv_cache(
+    cache_kv = update_kv_cache(
         new_kv_c,
         new_k_pe,
-        cache_kv_c,
-        cache_k_pe,
+        cache_kv,
         kv_lens,
         page_indices,
         cu_q_lens,
@@ -1202,7 +1186,7 @@ def mla_ragged_paged_attention(
     lkv_dim = new_kv_c.shape[-1]
     r_dim = new_k_pe.shape[-1]
-    _, page_size_per_kv_packing, kv_packing, _ = cache_kv_c.shape
+    _, page_size_per_kv_packing, kv_packing, _ = cache_kv.shape
     page_size = page_size_per_kv_packing * kv_packing
     _, num_q_heads_per_q_packing, q_packing, _ = ql_nope.shape
     max_num_seqs = kv_lens.shape[0]
@@ -1221,23 +1205,21 @@ def mla_ragged_paged_attention(
         pl.BlockSpec(memory_space=pltpu.HBM),
         pl.BlockSpec(memory_space=pltpu.HBM),
         pl.BlockSpec(memory_space=pltpu.HBM),
-        pl.BlockSpec(memory_space=pltpu.HBM),
     ]
     out_specs = [
         pl.BlockSpec(memory_space=pltpu.HBM),
         pl.BlockSpec(memory_space=pltpu.HBM),
-        pl.BlockSpec(memory_space=pltpu.HBM),
     ]
     bkvc_double_buf = pltpu.VMEM(
         (2, bkv_sz_per_kv_packing, kv_packing, lkv_dim),
-        cache_kv_c.dtype,
+        cache_kv.dtype,
     )
     bkpe_double_buf = pltpu.VMEM(
         (2, bkv_sz_per_kv_packing, kv_packing, r_dim),
-        cache_k_pe.dtype,
+        cache_kv.dtype,
     )
     bq_nope_double_buf = pltpu.VMEM(
@@ -1320,30 +1302,26 @@ def mla_ragged_paged_attention(
             ),
             out_shape=[
                 jax.ShapeDtypeStruct(shape=ql_nope.shape, dtype=ql_nope.dtype),
-                jax.ShapeDtypeStruct(shape=cache_kv_c.shape,
-                                     dtype=cache_kv_c.dtype),
-                jax.ShapeDtypeStruct(shape=cache_k_pe.shape,
-                                     dtype=cache_k_pe.dtype),
+                jax.ShapeDtypeStruct(shape=cache_kv.shape,
+                                     dtype=cache_kv.dtype),
             ],
             input_output_aliases={
                 7: 0,
                 11: 1,
-                12: 2
             },
             name=scope_name,
         ))
-    output, updated_kv_c, updated_k_pe = kernel(
+    output, updated_kv = kernel(
         *scalar_prefetches,
         ql_nope,
         q_pe,
         new_kv_c,
         new_k_pe,
-        cache_kv_c,
-        cache_k_pe,
+        cache_kv,
     )
     output = prepare_outputs(
         output, actual_num_q_heads,
         actual_lkv_dim)  # [max_num_tokens, actual_num_q_heads, actual_lkv_dim]
-    return output, updated_kv_c, updated_k_pe
+    return output, updated_kv

tpu_inference/kernels/quantized_matmul/kernel.py CHANGED Viewed

@@ -9,12 +9,58 @@ from jax._src import dtypes
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
+from tpu_inference.kernels.quantized_matmul import util
 from tpu_inference.kernels.quantized_matmul.tuned_block_sizes import (
     TunedValue, get_device_vmem_limit, get_tuned_block_sizes)
 from tpu_inference.kernels.quantized_matmul.util import (get_kernel_name,
                                                          next_multiple,
                                                          unfold_args)
+quantize_tensor = util.quantize_tensor
+def xla_quantized_matmul(
+    x: jax.Array,
+    w_q: jax.Array,
+    w_scale: jax.Array,
+    quantize_activation=True,
+) -> jax.Array:
+    """
+    Reference (pure JAX) implementation of the quantized matmul kernel below.
+    Args:
+        x:  Activation.
+        w_q: Weight quantized array. [n_output_features, n_input_features]
+        w_s: Weight quantization scale. [n_output_features]
+        mesh: Mesh to shard on.
+        weight_sharding: PartitionSpec for the weight tensor.
+    Returns:
+        Output of the quantized matmul.
+    """
+    if quantize_activation:
+        acc_dtype = jnp.float32
+        if quantize_activation and jnp.issubdtype(w_q.dtype, jnp.integer):
+            acc_dtype = jnp.int32
+        x_q, x_scale = quantize_tensor(x, w_q.dtype)
+        out = jax.lax.dot_general(
+            x_q,
+            w_q,
+            dimension_numbers=(((1, ), (1, )), ((), ())),
+            preferred_element_type=acc_dtype,
+        ).astype(jnp.float32)
+        out *= x_scale
+    else:
+        out = jax.lax.dot_general(
+            x,
+            w_q,
+            dimension_numbers=(((1, ), (1, )), ((), ())),
+            preferred_element_type=jnp.float32,
+        )
+    out *= jnp.expand_dims(w_scale, 0)
+    return out.astype(x.dtype)
 def quantize_array(
     x: jax.Array,  # [bs_block_size, in_block_size]
@@ -50,11 +96,20 @@ def get_vmem_limit(
     """Calculate VMEM limit for the kernel."""
     # Calculate in/out VMEM size.
-    x_size = batch_block_size * in_block_size * dtypes.bit_width(x_dtype)
-    x_abs_max_size = batch_block_size * dtypes.bit_width(scale_dtype)
-    w_q_size = out_block_size * in_block_size * dtypes.bit_width(w_q_dtype)
-    w_scale_size = out_block_size * dtypes.bit_width(scale_dtype)
-    out_size = batch_block_size * out_block_size * dtypes.bit_width(out_dtype)
+    x_size = (batch_block_size *
+              in_block_size * (dtypes.bit_width(x_dtype) if hasattr(
+                  dtypes, "bit_width") else dtypes.itemsize_bits(x_dtype)))
+    x_abs_max_size = (
+        batch_block_size * (dtypes.bit_width(scale_dtype) if hasattr(
+            dtypes, "bit_width") else dtypes.itemsize_bits(scale_dtype)))
+    w_q_size = (out_block_size *
+                in_block_size * (dtypes.bit_width(w_q_dtype) if hasattr(
+                    dtypes, "bit_width") else dtypes.itemsize_bits(w_q_dtype)))
+    w_scale_size = (out_block_size * (dtypes.bit_width(scale_dtype) if hasattr(
+        dtypes, "bit_width") else dtypes.itemsize_bits(scale_dtype)))
+    out_size = (batch_block_size *
+                out_block_size * (dtypes.bit_width(out_dtype) if hasattr(
+                    dtypes, "bit_width") else dtypes.itemsize_bits(out_dtype)))
     vmem_in_out = x_size + x_abs_max_size + w_q_size + w_scale_size + out_size
     vmem_in_out *= 2  # Account for compute and vreg spills.
@@ -68,9 +123,15 @@ def get_vmem_limit(
     vmem_in_out += out_size if (n_batch > 1 or n_out > 1) else 0
     # Calculate scratch VMEM size.
-    acc_size = batch_block_size * out_block_size * dtypes.bit_width(acc_dtype)
-    x_q_size = batch_block_size * in_block_size * dtypes.bit_width(x_q_dtype)
-    x_scale_size = batch_block_size * dtypes.bit_width(scale_dtype)
+    acc_size = (batch_block_size *
+                out_block_size * (dtypes.bit_width(acc_dtype) if hasattr(
+                    dtypes, "bit_width") else dtypes.itemsize_bits(acc_dtype)))
+    x_q_size = (batch_block_size *
+                in_block_size * (dtypes.bit_width(x_q_dtype) if hasattr(
+                    dtypes, "bit_width") else dtypes.itemsize_bits(x_q_dtype)))
+    x_scale_size = (
+        batch_block_size * (dtypes.bit_width(scale_dtype) if hasattr(
+            dtypes, "bit_width") else dtypes.itemsize_bits(scale_dtype)))
     vmem_scratch = acc_size if save_acc else 0
     vmem_scratch += x_q_size + x_scale_size if save_x_q else 0

tpu_inference/kernels/ragged_paged_attention/v2/kernel.py CHANGED Viewed

@@ -655,7 +655,8 @@ def cdiv(a, b):
 def get_dtype_packing(dtype):
-    bits = dtypes.bit_width(dtype)
+    bits = (dtypes.bit_width(dtype)
+            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
     return 32 // bits

tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py CHANGED Viewed

@@ -200,7 +200,8 @@ def _prev_power_of_2(n: int) -> int:
 def _get_page_size_bytes(block_size: int, num_combined_kv_heads: int,
                          head_size: int, kv_cache_dtype) -> int:
     """Returns the size in bytes of one page of the KV cache."""
-    kv_cache_dtype_bit_size = dtypes.bit_width(kv_cache_dtype)
+    kv_cache_dtype_bit_size = (dtypes.bit_width(kv_cache_dtype) if hasattr(
+        dtypes, "bit_width") else dtypes.itemsize_bits(kv_cache_dtype))
     padded_head_size = _ceil_div(
         head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl

Potentially problematic release.

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl