tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.0rc2.post7__py3-none-any.whl
This diff shows the content of publicly available package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of tpu-inference may be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +312 -0
- tests/layers/vllm/test_unquantized.py +651 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +21 -3
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +78 -1
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +38 -7
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +17 -0
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +28 -5
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +210 -260
- tpu_inference/layers/vllm/linear_common.py +57 -22
- tpu_inference/layers/vllm/quantization/__init__.py +16 -0
- tpu_inference/layers/vllm/quantization/awq.py +15 -1
- tpu_inference/layers/vllm/quantization/common.py +33 -18
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
- tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
- tpu_inference/layers/vllm/sharding.py +21 -4
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +74 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +88 -25
- tpu_inference/models/jax/utils/weight_utils.py +39 -2
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +47 -64
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +72 -37
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +45 -15
- tpu_inference/runner/lora_utils.py +14 -0
- tpu_inference/runner/multimodal_manager.py +15 -1
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +41 -16
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +42 -36
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +63 -50
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
- tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
tpu_inference/kernels/ragged_paged_attention/v3/util.py CHANGED

@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Utility functions for ragged paged attention."""
 import jax
 from jax._src import dtypes

@@ -13,7 +26,8 @@ def align_to(x, a):
 
 
 def get_dtype_bitwidth(dtype):
-    return dtypes.bit_width(dtype)
+    return (dtypes.bit_width(dtype)
+            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
 
 
 def get_dtype_packing(dtype):
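The get_dtype_bitwidth change above keeps the helper working whether jax._src.dtypes exposes the bit-width query as bit_width or only as itemsize_bits, which varies across JAX releases. Below is a minimal sketch of how such a bitwidth helper feeds a packing computation of the kind this kernel uses; packing_per_32bit is a hypothetical name used only for illustration, not a tpu-inference API.

# Sketch only: mirrors the hasattr-based fallback from util.py above.
# packing_per_32bit is a hypothetical helper, not part of tpu-inference.
import jax.numpy as jnp
from jax._src import dtypes


def get_dtype_bitwidth(dtype):
    # Prefer dtypes.bit_width when it exists; otherwise fall back to the
    # alternative accessor, exactly as the diff above does.
    return (dtypes.bit_width(dtype)
            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))


def packing_per_32bit(dtype):
    # How many elements of `dtype` fit into one 32-bit word,
    # e.g. 2 for bfloat16 and 4 for int8.
    return 32 // get_dtype_bitwidth(dtype)


print(packing_per_32bit(jnp.bfloat16))  # 2
print(packing_per_32bit(jnp.int8))      # 4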
tpu_inference/layers/__init__.py CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
tpu_inference/layers/common/__init__.py CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
tpu_inference/layers/common/attention_interface.py CHANGED

@@ -1,10 +1,23 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
+
 import functools
 import math
 from typing import Any, Callable, Optional, Tuple
 
 import jax
 import jax.numpy as jnp
-from jax.experimental import shard_map
 from jax.experimental.pallas.ops.tpu.paged_attention import paged_attention
 from jax.experimental.pallas.ops.tpu.splash_attention import \
     splash_attention_kernel as splash

@@ -55,11 +68,11 @@ def sharded_flash_attention(
         vmem_limit_bytes=vmem_limit_bytes)
 
     return jax.jit(
-
-
-
-
-
+        jax.shard_map(_flash_attention,
+                      mesh=mesh,
+                      in_specs=in_specs,
+                      out_specs=out_specs,
+                      check_vma=False))
 
 
 def sharded_paged_attention(

@@ -94,12 +107,12 @@ def sharded_paged_attention(
     )
 
     return jax.jit(
-
+        jax.shard_map(
             _paged_attention_fn,
             mesh=mesh,
             in_specs=in_specs,
             out_specs=out_specs,
-
+            check_vma=False,
         ))
 
 

@@ -257,7 +270,7 @@ def sharded_splash_attention(
     )
     out_specs = P("data", "model", None, None)
     return jax.jit(
-
+        jax.shard_map(
             functools.partial(
                 apply_splash,
                 window_size=window_size,

@@ -267,7 +280,7 @@
             mesh=mesh,
             in_specs=in_specs,
             out_specs=out_specs,
-
+            check_vma=False,
         ))
 
 

@@ -308,13 +321,7 @@ def sharded_ragged_paged_attention(
     args = (q, k, v, kv_cache, kv_lens, page_indices, cu_q_lens, distribution)
 
     use_hd64 = q.shape[-1] == 64
-
-    func = ragged_paged_attention
-    if use_hd64:
-        func = functools.partial(ragged_paged_attention_hd64,
-                                 strict_sliding_window=False)
-    else:
-        func = ragged_paged_attention
+    func = ragged_paged_attention_hd64 if use_hd64 else ragged_paged_attention
 
     if attention_sink is not None:
         if not use_hd64:

@@ -334,12 +341,12 @@
         v_scale=v_scale,
     )
 
-    return
+    return jax.shard_map(
         _ragged_paged_attention,
         mesh=mesh,
         in_specs=in_specs,
         out_specs=out_specs,
-
+        check_vma=False,
     )(*args)
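Taken together, these hunks drop the jax.experimental.shard_map import and call the top-level jax.shard_map API instead, passing check_vma=False where the removed lines previously configured the old wrapper. Below is a small self-contained sketch of the new calling convention on a one-axis mesh; the double function is only a stand-in for the attention bodies that the library actually shards.

# Sketch, assuming a recent JAX release that exposes jax.shard_map with the
# check_vma keyword (as used in the diff above). `double` is a placeholder.
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P


def double(x):
    return 2 * x


# Single-device mesh so the sketch runs anywhere; the library builds its mesh
# from the real TPU topology.
mesh = Mesh(jax.devices()[:1], axis_names=("model",))

sharded = jax.jit(
    jax.shard_map(double,
                  mesh=mesh,
                  in_specs=P("model"),
                  out_specs=P("model"),
                  check_vma=False))

x = jnp.arange(8.0)
print(sharded(x))  # [ 0.  2.  4. ...  14.]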
tpu_inference/layers/common/attention_metadata.py CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
+
 import functools
 from dataclasses import dataclass, field
 from typing import Any
tpu_inference/layers/common/quant_methods.py CHANGED

@@ -1,7 +1,22 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
+
 UNQUANTIZED = "unquantized"
 MXFP4 = "mxfp4"
 AWQ = "awq"
 COMPRESSED_TENSORS = "compressed-tensors"
+FP8 = "fp8"
 
 
 def get_tpu_quant_method(quant_method: str) -> str:
tpu_inference/layers/common/quantization.py ADDED

@@ -0,0 +1,270 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
+
+import itertools
+from typing import Tuple
+
+import jax
+import jax.numpy as jnp
+
+MXFP4_BLOCK_SIZE = 32
+
+
+def quantize_tensor_to_mxfp4_packed(
+    tensor: jax.Array,
+    axis: int | tuple = -1,
+) -> Tuple[jax.Array, jax.Array]:
+    """Quantize a tensor to mxfp4 and pack it into uint8."""
+
+    # Perform regular block quantization.
+    tensor_q, scale = quantize_tensor(
+        jnp.float4_e2m1fn,
+        tensor,
+        axis,
+        MXFP4_BLOCK_SIZE,
+    )
+
+    # last two e2m1 elements will be packed into a single uint8 element.
+    bitcast_shape = tensor_q.shape[:-1] + (-1, 2)
+    tensor_q = tensor_q.reshape(bitcast_shape)
+    tensor_q_packed = jax.lax.bitcast_convert_type(tensor_q, jnp.uint8)
+
+    # Since TPU does not have native support for e8m0, we convert scale into
+    # e8m0 manually and store it as uint8.
+    e8m0_finfo = jnp.finfo(jnp.float8_e8m0fnu)
+    _, scale_exp = jnp.frexp(scale)
+    # Subtract exponents by one since e8m0 has no decimal.
+    scale_exp -= 1
+    scale_exp = (scale_exp - e8m0_finfo.minexp).astype(jnp.uint8)
+
+    return tensor_q_packed, scale_exp
+
+
+def u8_unpack_e2m1(u8_packed_e2m1: jax.Array) -> jax.Array:
+    """Unpack e2m1 tensor packed into u8."""
+    assert u8_packed_e2m1.dtype == jnp.uint8
+    e2m1 = jax.lax.bitcast_convert_type(u8_packed_e2m1, jnp.float4_e2m1fn)
+    # bitcast creates one more dimension that splits 8 bits into two e2m1.
+    # we flatten them with the last dim.
+    return jnp.reshape(e2m1, e2m1.shape[:-2] + (-1, ))
+
+
+def e8m0_to_fp32(u8: jax.Array) -> jax.Array:
+    """Convert e8m0 (that was bitcasted to u8) into fp32"""
+    assert u8.dtype == jnp.uint8
+
+    e8_finfo = jnp.finfo(jnp.float8_e8m0fnu)
+    exponents = u8.astype(jnp.int32) + e8_finfo.minexp
+    ones = jnp.ones_like(u8, dtype=jnp.float32)
+    return jnp.ldexp(ones, exponents)
+
+
+def dequantize_tensor(
+    tensor_q: jax.Array,
+    scale: jax.Array,
+    axis: int | None | tuple = -1,
+    out_dtype: jnp.dtype = jnp.bfloat16,
+) -> jax.Array:
+    """Dequantize a quantized tensor
+
+    Args:
+      tensor_q: Quantized tensor.
+      scale: Quantization scale.
+      axis: The axis tensor was quantized. None denotes per-tensor.
+      out_dtype: Dtype of the output.
+
+    Returns:
+      Dequantized tensor_q.
+    """
+    if axis is None:
+        # Perform per-tensor quantization.
+        axis = [i for i in range(tensor_q.ndim)]
+    if isinstance(axis, int):
+        axis = [axis]
+
+    orig_shape = tensor_q.shape
+    if tensor_q.ndim == scale.ndim:
+        # Indicates the tensor was block quantized.
+        blocked_shape = [[i] for i in orig_shape]
+        for i in axis:
+            num_blocks = scale.shape[i]
+            if tensor_q.shape[i] % num_blocks:
+                raise ValueError(
+                    f"Unable to perform block dequantization. axis={i} of "
+                    f"{tensor_q.shape=} is not divisible by {num_blocks=}", )
+            block_size = tensor_q.shape[i] // num_blocks
+
+            blocked_shape[i] = (num_blocks, block_size)
+
+        # Convert all axis into positive values.
+        axis = sorted([(i + tensor_q.ndim) % tensor_q.ndim for i in axis])
+        # Shift axis by 1 since its original position is now occupied by
+        # num_blocks dim. Also, if n axes before an axis was also quantized,
+        # shift its position by n.
+        axis = [1 + n + i for n, i in enumerate(axis)]
+
+        # Flatten list of lists that contains (num_blocks, block).
+        blocked_shape = list(itertools.chain(*blocked_shape))
+        tensor_q = tensor_q.reshape(blocked_shape)
+
+        scale = jnp.expand_dims(scale, axis)
+
+    tensor = (tensor_q.astype(jnp.float32) * scale).astype(out_dtype)
+
+    return tensor.reshape(orig_shape)
+
+
+def dequantize_tensor_from_mxfp4_packed(
+    tensor_q: jax.Array,
+    scale: jax.Array,
+    axis: int | tuple = -1,
+    out_dtype: jnp.dtype = jnp.bfloat16,
+) -> jax.Array:
+    """Dequantize packed mxfp4 tensor.
+
+    Args:
+      tensor_q: fp4 tensor packed into uint8.
+      scale: e8m0 scale packed into uint8.
+      axis: The axis tensor was quantized.
+      out_dtype: Dtype of the output.
+
+    Returns:
+      Dequantized tensor_q.
+    """
+    tensor_e2m1 = u8_unpack_e2m1(tensor_q)
+    scale_fp32 = e8m0_to_fp32(scale)
+
+    return dequantize_tensor(
+        tensor_e2m1,
+        scale_fp32,
+        axis,
+        out_dtype,
+    )
+
+
+def quantize_tensor(
+    dtype: jnp.dtype,
+    tensor: jax.Array,
+    axis: int | tuple | None = -1,
+    block_size: int | None = None,
+    pad_tensor: bool = False,
+) -> tuple[jax.Array, jax.Array]:
+    """Quantize tensor.
+
+    Args:
+      dtype: dtype to perform quantization.
+      tensor: Unquantized tensor
+      axis: Axis to perform quantization. None denotes per-tensor.
+      block_size: Specify block quantization size.
+      pad_tensor: Whether to pad the axis along block size.
+
+    Returns:
+      Tensor quantized to dtype.
+    """
+    if axis is None:
+        # Perform per-tensor quantization.
+        axis = [i for i in range(tensor.ndim)]
+    if isinstance(axis, int):
+        axis = [axis]
+
+    orig_shape = tensor.shape
+    mask = jnp.ones_like(tensor, jnp.int32)
+
+    if block_size is not None:
+        if isinstance(block_size, int):
+            block_size = [block_size] * len(axis)
+
+        blocked_shape = [[i] for i in orig_shape]
+        pad_width = [[0, 0] for _ in range(tensor.ndim)]
+        for i, block in zip(axis, block_size):
+            num_blocks = (tensor.shape[i] + block - 1) // block
+            padding_size = num_blocks * block - tensor.shape[i]
+            if padding_size and not pad_tensor:
+                raise ValueError(
+                    f"Unable to perform block quantization. axis={i} of "
+                    f"{tensor.shape=} is not divisible by {block=}")
+
+            # Pad the tensor to align with block size.
+            pad_width[i][1] = padding_size
+
+            blocked_shape[i] = (num_blocks, block)
+
+        # In order to avoid padded values affecting scale value, we pad it
+        # using edge value of the tensor.
+        tensor = jnp.pad(tensor, pad_width, "edge")
+        mask = jnp.pad(mask, pad_width)
+
+        orig_shape = tensor.shape
+        # Convert all axis into positive values.
+        axis = sorted([i % tensor.ndim for i in axis])
+        # Shift axis by 1 since its original position is now occupied by
+        # num_blocks dim. Also, if n axes before an axis was also quantized,
+        # shift its position by n.
+        axis = [1 + n + i for n, i in enumerate(axis)]
+
+        # Flatten list of lists that contains (num_blocks, block).
+        blocked_shape = list(itertools.chain(*blocked_shape))
+        tensor = tensor.reshape(blocked_shape)
+
+    if jnp.issubdtype(dtype, jnp.integer):
+        dtype_info = jnp.iinfo(dtype)
+    else:
+        dtype_info = jnp.finfo(dtype)
+
+    dtype_max = float(dtype_info.max)
+    dtype_min = float(dtype_info.min)
+
+    abs_max = jnp.max(jnp.abs(tensor), axis=axis, keepdims=True)
+    scale = abs_max / dtype_max
+
+    tensor_q = jnp.clip(tensor / scale, dtype_min, dtype_max)
+    tensor_q = tensor_q.reshape(orig_shape)
+    tensor_q = tensor_q.astype(dtype)
+
+    # To avoid padded values affecting output of quantized matmul, we mask them
+    # out with 0s.
+    tensor_q = jnp.where(mask, tensor_q, 0)
+
+    scale = jnp.squeeze(scale, axis).astype(jnp.float32)
+
+    return tensor_q, scale
+
+
+def static_per_tensor_quantize_tensor(
+    dtype: jnp.dtype,
+    tensor: jax.Array,
+    scale: float,
+) -> jax.Array:
+    if jnp.issubdtype(dtype, jnp.integer):
+        dtype_info = jnp.iinfo(dtype)
+    else:
+        dtype_info = jnp.finfo(dtype)
+
+    dtype_max = float(dtype_info.max)
+    dtype_min = float(dtype_info.min)
+
+    return jnp.clip(tensor / scale, dtype_min, dtype_max).astype(dtype)
+
+
+def quantize_kv(
+    dtype: jnp.dtype,
+    key: jax.Array,
+    value: jax.Array,
+    k_scale: float,
+    v_scale: float,
+) -> Tuple[jax.Array, jax.Array]:
+    """Static quantize key and value tensors."""
+    key = static_per_tensor_quantize_tensor(dtype, key, k_scale)
+    value = static_per_tensor_quantize_tensor(dtype, value, v_scale)
+    return key, value
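The new module implements symmetric absmax quantization with optional block scales, static per-tensor KV quantization, and helpers for packing mxfp4 values (e2m1 elements stored two-per-uint8 with e8m0 block scales). Below is a short round-trip sketch of the API added above; shapes, dtypes, and the printed checks are illustrative, and the mxfp4 path assumes a JAX build that ships the float4_e2m1fn and float8_e8m0fnu dtypes.

# Sketch: int8 block quantization round trip, then the packed-mxfp4 helpers,
# using the functions from tpu_inference/layers/common/quantization.py above.
import jax
import jax.numpy as jnp

from tpu_inference.layers.common.quantization import (
    dequantize_tensor, dequantize_tensor_from_mxfp4_packed, quantize_kv,
    quantize_tensor, quantize_tensor_to_mxfp4_packed)

w = jax.random.normal(jax.random.key(0), (128, 256), dtype=jnp.float32)

# Per-block int8 quantization along the last axis (block_size must divide 256
# unless pad_tensor=True).
w_q, scale = quantize_tensor(jnp.int8, w, axis=-1, block_size=64)
w_hat = dequantize_tensor(w_q, scale, axis=-1, out_dtype=jnp.float32)
print(float(jnp.max(jnp.abs(w - w_hat))))  # small quantization error

# mxfp4: e2m1 values packed two-per-uint8, e8m0 scales stored as uint8.
w_fp4, scale_e8m0 = quantize_tensor_to_mxfp4_packed(w, axis=-1)
w_hat4 = dequantize_tensor_from_mxfp4_packed(w_fp4, scale_e8m0, axis=-1)
print(w_fp4.dtype, scale_e8m0.dtype, w_hat4.shape)  # uint8 uint8 (128, 256)

# Static per-tensor KV-cache quantization, as used by the Attention module in
# the attention.py hunks further below.
k = jax.random.normal(jax.random.key(1), (16, 8, 128), dtype=jnp.bfloat16)
v = jax.random.normal(jax.random.key(2), (16, 8, 128), dtype=jnp.bfloat16)
k_q, v_q = quantize_kv(jnp.float8_e4m3fn, k, v, k_scale=1.0, v_scale=1.0)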
tpu_inference/layers/common/sharding.py CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
+
 import json
 import math
 from dataclasses import asdict, dataclass

@@ -26,7 +40,7 @@ class ShardingAxisNameBase:
     MLP_TENSOR = ('attn_dp', 'model', 'expert')
     MOE_TENSOR = ('attn_dp', 'model')
     EXPERT = ('attn_dp', 'expert', 'model')
-    VOCAB = ('expert', 'model')
+    VOCAB = ('expert', 'attn_dp', 'model')
 
 
 class ShardingAxisName2D:

@@ -119,10 +133,19 @@ class ShardingConfigManager:
                 False)
         if enable_dp_attention:
            # Replicate attention layer when num_kv_heads < TP
-            num_kv_heads = vllm_config.model_config.get_total_num_kv_heads(
+            num_kv_heads = 1 if vllm_config.model_config.use_mla else vllm_config.model_config.get_total_num_kv_heads(
+            )
+            cache_dtype = vllm_config.cache_config.cache_dtype
+            if cache_dtype == 'auto':
+                cache_dtype = vllm_config.model_config.dtype
             kv_dtype = utils.get_jax_dtype_from_str_dtype(
-
+                cache_dtype) or jnp.bfloat16
             packing = 4 // jnp.dtype(kv_dtype).itemsize
+
+            # The default head dim is 128 but 64 is also supported as a special case.
+            if vllm_config.model_config.get_head_size() == 64:
+                packing *= 2
+
             # When num_kv_heads * 2 / packing < TP, tensor parallelism would
             # duplicate KV heads across devices, wasting kv cache memory.
             # Use attention DP instead to reduce per-device num_kv_heads and

@@ -168,8 +191,8 @@
        if sharding_strategy.attention_data_parallelism > 1:
            if not envs.NEW_MODEL_DESIGN:
                raise ValueError(
-                    "Must run Attention DP with NEW_MODEL_DESIGN enabled. Please set
-                    "NEW_MODEL_DESIGN=True
+                    "Must run Attention DP with NEW_MODEL_DESIGN enabled. Please set "
+                    "NEW_MODEL_DESIGN=True")
 
     @property
     def total_dp_size(self) -> int:
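The sharding hunk above now resolves an 'auto' cache dtype to the model dtype before computing KV-cache packing, treats MLA models as having a single KV head, and doubles packing when the head dim is 64; per the comment in the diff, attention DP is preferred once num_kv_heads * 2 / packing drops below the tensor-parallel size. Below is a small worked sketch of that arithmetic with illustrative inputs; kv_packing and prefers_attention_dp are hypothetical helper names, not library functions.

# Sketch of the packing / attention-DP arithmetic from sharding.py above.
# All inputs are example values; helper names are illustrative only.
import jax.numpy as jnp


def kv_packing(kv_dtype, head_size):
    # A 32-bit word holds 4 / itemsize elements; head dim 64 doubles packing.
    packing = 4 // jnp.dtype(kv_dtype).itemsize
    if head_size == 64:
        packing *= 2
    return packing


def prefers_attention_dp(num_kv_heads, tp_size, kv_dtype, head_size):
    # When num_kv_heads * 2 / packing < TP, plain tensor parallelism would
    # duplicate KV heads across devices and waste KV-cache memory, so
    # attention DP is preferred (per the comment in the diff).
    packing = kv_packing(kv_dtype, head_size)
    return num_kv_heads * 2 / packing < tp_size


print(kv_packing(jnp.bfloat16, 128))                   # 2
print(kv_packing(jnp.float8_e4m3fn, 64))               # 8
print(prefers_attention_dp(8, 16, jnp.bfloat16, 128))  # True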
tpu_inference/layers/jax/__init__.py CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
tpu_inference/layers/jax/attention/__init__.py CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
tpu_inference/layers/jax/attention/attention.py CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC ... [Apache License 2.0 header, 13 lines, identical to the block shown for util.py above]
+
 from dataclasses import InitVar, dataclass
 from typing import Any, Tuple
 

@@ -5,7 +19,6 @@ import jax
 import jax.numpy as jnp
 from flax import nnx
 from flax.typing import Sharding
-from jax.experimental import shard_map
 from jax.sharding import Mesh
 from jax.sharding import PartitionSpec as P
 

@@ -13,6 +26,7 @@ from tpu_inference import utils
 from tpu_inference.kernels.ragged_paged_attention.v3.kernel import \
     ragged_paged_attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
 from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.base import create_param
 from tpu_inference.layers.jax.rope_interface import apply_rope

@@ -149,9 +163,8 @@ class Attention(nnx.Module):
             # q_scale = self._q_scale
             k_scale = self._k_scale
             v_scale = self._v_scale
-            k_SKH, v_SKH =
-
-                k_scale, v_scale)
+            k_SKH, v_SKH = quantize_kv(self.kv_cache_quantized_dtype, k_SKH,
+                                       v_SKH, k_scale, v_scale)
 
         with jax.named_scope("attn_op"):
             new_kv_cache, outputs_TNH = self.attention(

@@ -236,12 +249,12 @@
         )
 
         output_TNH, kv_cache = jax.jit(
-
+            jax.shard_map(
                 _ragged_paged_attention,
                 mesh=mesh,
                 in_specs=in_specs,
                 out_specs=out_specs,
-
+                check_vma=False,
             ))(
                 q_TNH,
                 k_SKH,