tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference has been flagged as potentially problematic; more details are available on the release page.

Files changed (257)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_disagg_utils.py +14 -0
  4. tests/core/test_dp_scheduler.py +650 -768
  5. tests/core/test_init.py +14 -0
  6. tests/distributed/__init__.py +13 -0
  7. tests/distributed/test_distributed_utils.py +120 -0
  8. tests/distributed/test_tpu_connector.py +478 -0
  9. tests/e2e/__init__.py +13 -0
  10. tests/e2e/test_async_scheduler.py +211 -0
  11. tests/e2e/test_data_parallel.py +289 -0
  12. tests/e2e/test_hybrid_kvcache.py +219 -0
  13. tests/e2e/test_local_disagg.py +257 -0
  14. tests/e2e/test_model_loader.py +268 -0
  15. tests/e2e/test_multi_modal_inference.py +111 -0
  16. tests/e2e/test_pipeline_parallel.py +265 -0
  17. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  18. tests/e2e/test_sampling_params.py +269 -0
  19. tests/e2e/test_speculative_decoding.py +311 -0
  20. tests/e2e/test_structured_decoding.py +46 -0
  21. tests/executors/__init__.py +13 -0
  22. tests/executors/test_ray_distributed_executor.py +199 -0
  23. tests/experimental/__init__.py +13 -0
  24. tests/experimental/test_llama3_jax_stashed.py +208 -0
  25. tests/kernels/__init__.py +13 -0
  26. tests/kernels/collectives/__init__.py +13 -0
  27. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  28. tests/kernels/fused_moe_v1_test.py +317 -34
  29. tests/kernels/gmm_test.py +205 -0
  30. tests/kernels/mla_v1_test.py +143 -41
  31. tests/kernels/quantized_matmul_kernel_test.py +2 -34
  32. tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
  33. tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
  34. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
  35. tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
  36. tests/layers/__init__.py +13 -0
  37. tests/layers/common/__init__.py +13 -0
  38. tests/layers/common/test_attention_interface.py +156 -0
  39. tests/layers/common/test_quantization.py +149 -0
  40. tests/layers/jax/__init__.py +13 -0
  41. tests/layers/jax/attention/__init__.py +13 -0
  42. tests/layers/jax/attention/test_common_attention.py +103 -0
  43. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  44. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  45. tests/layers/jax/moe/__init__.py +13 -0
  46. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  47. tests/layers/jax/sample/__init__.py +13 -0
  48. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  49. tests/layers/jax/sample/test_sampling.py +115 -0
  50. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  51. tests/layers/jax/test_layers.py +155 -0
  52. tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
  53. tests/layers/jax/test_rope.py +93 -0
  54. tests/layers/jax/test_sharding.py +159 -0
  55. tests/layers/jax/test_transformer_block.py +152 -0
  56. tests/layers/vllm/__init__.py +13 -0
  57. tests/layers/vllm/test_attention.py +363 -0
  58. tests/layers/vllm/test_awq.py +406 -0
  59. tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
  60. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
  62. tests/layers/vllm/test_fp8.py +17 -0
  63. tests/layers/vllm/test_mxfp4.py +320 -0
  64. tests/layers/vllm/test_unquantized.py +662 -0
  65. tests/layers/vllm/utils.py +87 -0
  66. tests/lora/__init__.py +13 -0
  67. tests/lora/conftest.py +14 -0
  68. tests/lora/test_bgmv.py +14 -0
  69. tests/lora/test_layers.py +26 -6
  70. tests/lora/test_lora.py +15 -1
  71. tests/lora/test_lora_perf.py +67 -0
  72. tests/models/__init__.py +13 -0
  73. tests/models/common/__init__.py +13 -0
  74. tests/models/common/test_model_loader.py +455 -0
  75. tests/models/jax/__init__.py +13 -0
  76. tests/models/jax/test_deepseek_v3.py +401 -0
  77. tests/models/jax/test_llama3.py +184 -0
  78. tests/models/jax/test_llama4.py +298 -0
  79. tests/models/jax/test_llama_eagle3.py +197 -0
  80. tests/models/jax/test_llama_guard_4.py +242 -0
  81. tests/models/jax/test_qwen2.py +172 -0
  82. tests/models/jax/test_qwen2_5_vl.py +605 -0
  83. tests/models/jax/test_qwen3.py +169 -0
  84. tests/models/jax/test_weight_loading.py +180 -0
  85. tests/models/jax/utils/__init__.py +13 -0
  86. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  87. tests/platforms/__init__.py +13 -0
  88. tests/platforms/test_tpu_platform.py +54 -0
  89. tests/runner/__init__.py +13 -0
  90. tests/runner/test_block_table.py +395 -0
  91. tests/runner/test_input_batch.py +226 -0
  92. tests/runner/test_kv_cache.py +220 -0
  93. tests/runner/test_kv_cache_manager.py +498 -0
  94. tests/runner/test_multimodal_manager.py +429 -0
  95. tests/runner/test_persistent_batch_manager.py +84 -0
  96. tests/runner/test_speculative_decoding_manager.py +368 -0
  97. tests/runner/test_structured_decoding_manager.py +220 -0
  98. tests/runner/test_tpu_runner.py +261 -0
  99. tests/runner/test_tpu_runner_dp.py +1099 -0
  100. tests/runner/test_tpu_runner_mesh.py +200 -0
  101. tests/runner/test_utils.py +411 -0
  102. tests/spec_decode/__init__.py +13 -0
  103. tests/spec_decode/test_eagle3.py +311 -0
  104. tests/test_base.py +14 -0
  105. tests/test_envs.py +110 -12
  106. tests/test_tpu_info.py +14 -0
  107. tests/test_utils.py +2 -45
  108. tests/worker/__init__.py +13 -0
  109. tests/worker/tpu_worker_test.py +414 -0
  110. tpu_inference/__init__.py +14 -0
  111. tpu_inference/core/__init__.py +13 -0
  112. tpu_inference/core/sched/__init__.py +13 -0
  113. tpu_inference/core/sched/dp_scheduler.py +372 -56
  114. tpu_inference/distributed/__init__.py +13 -0
  115. tpu_inference/distributed/jax_parallel_state.py +14 -0
  116. tpu_inference/distributed/tpu_connector.py +15 -10
  117. tpu_inference/distributed/utils.py +56 -4
  118. tpu_inference/envs.py +92 -8
  119. tpu_inference/executors/__init__.py +13 -0
  120. tpu_inference/executors/ray_distributed_executor.py +25 -4
  121. tpu_inference/experimental/__init__.py +13 -0
  122. tpu_inference/experimental/llama3_jax_stashed.py +14 -0
  123. tpu_inference/kernels/__init__.py +13 -0
  124. tpu_inference/kernels/collectives/__init__.py +13 -0
  125. tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
  126. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
  127. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  128. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  129. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  130. tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
  131. tpu_inference/kernels/megablox/__init__.py +13 -0
  132. tpu_inference/kernels/megablox/common.py +54 -0
  133. tpu_inference/kernels/megablox/gmm.py +646 -0
  134. tpu_inference/kernels/mla/__init__.py +13 -0
  135. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  136. tpu_inference/kernels/mla/v1/kernel.py +117 -145
  137. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  138. tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
  139. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  140. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  141. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
  142. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
  143. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  144. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
  145. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
  146. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
  147. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
  148. tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
  149. tpu_inference/layers/__init__.py +13 -0
  150. tpu_inference/layers/common/__init__.py +13 -0
  151. tpu_inference/layers/common/attention_interface.py +25 -12
  152. tpu_inference/layers/common/attention_metadata.py +14 -0
  153. tpu_inference/layers/common/fused_moe_gmm.py +506 -0
  154. tpu_inference/layers/common/quant_methods.py +15 -0
  155. tpu_inference/layers/common/quantization.py +282 -0
  156. tpu_inference/layers/common/sharding.py +32 -9
  157. tpu_inference/layers/common/utils.py +94 -0
  158. tpu_inference/layers/jax/__init__.py +13 -0
  159. tpu_inference/layers/jax/attention/__init__.py +13 -0
  160. tpu_inference/layers/jax/attention/attention.py +19 -6
  161. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
  162. tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
  163. tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
  164. tpu_inference/layers/jax/base.py +14 -0
  165. tpu_inference/layers/jax/constants.py +13 -0
  166. tpu_inference/layers/jax/layers.py +14 -0
  167. tpu_inference/layers/jax/misc.py +14 -0
  168. tpu_inference/layers/jax/moe/__init__.py +13 -0
  169. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
  170. tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
  171. tpu_inference/layers/jax/moe/moe.py +43 -3
  172. tpu_inference/layers/jax/pp_utils.py +53 -0
  173. tpu_inference/layers/jax/rope.py +14 -0
  174. tpu_inference/layers/jax/rope_interface.py +14 -0
  175. tpu_inference/layers/jax/sample/__init__.py +13 -0
  176. tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
  177. tpu_inference/layers/jax/sample/sampling.py +15 -1
  178. tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
  179. tpu_inference/layers/jax/transformer_block.py +14 -0
  180. tpu_inference/layers/vllm/__init__.py +13 -0
  181. tpu_inference/layers/vllm/attention.py +4 -4
  182. tpu_inference/layers/vllm/fused_moe.py +101 -494
  183. tpu_inference/layers/vllm/linear.py +64 -0
  184. tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
  185. tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
  186. tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
  187. tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
  188. tpu_inference/layers/vllm/quantization/__init__.py +19 -3
  189. tpu_inference/layers/vllm/quantization/awq.py +96 -82
  190. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  191. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
  192. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
  193. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  194. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
  195. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
  196. tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
  197. tpu_inference/layers/vllm/quantization/fp8.py +119 -0
  198. tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
  199. tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
  200. tpu_inference/lora/__init__.py +13 -0
  201. tpu_inference/lora/torch_lora_ops.py +8 -13
  202. tpu_inference/models/__init__.py +13 -0
  203. tpu_inference/models/common/__init__.py +13 -0
  204. tpu_inference/models/common/model_loader.py +112 -35
  205. tpu_inference/models/jax/__init__.py +13 -0
  206. tpu_inference/models/jax/deepseek_v3.py +267 -157
  207. tpu_inference/models/jax/gpt_oss.py +26 -10
  208. tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
  209. tpu_inference/models/jax/llama3.py +99 -36
  210. tpu_inference/models/jax/llama4.py +14 -0
  211. tpu_inference/models/jax/llama_eagle3.py +18 -5
  212. tpu_inference/models/jax/llama_guard_4.py +15 -1
  213. tpu_inference/models/jax/qwen2.py +17 -2
  214. tpu_inference/models/jax/qwen2_5_vl.py +179 -51
  215. tpu_inference/models/jax/qwen3.py +17 -2
  216. tpu_inference/models/jax/utils/__init__.py +13 -0
  217. tpu_inference/models/jax/utils/file_utils.py +14 -0
  218. tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
  219. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  220. tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
  221. tpu_inference/models/jax/utils/weight_utils.py +234 -155
  222. tpu_inference/models/vllm/__init__.py +13 -0
  223. tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
  224. tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
  225. tpu_inference/platforms/__init__.py +14 -0
  226. tpu_inference/platforms/tpu_platform.py +51 -72
  227. tpu_inference/runner/__init__.py +13 -0
  228. tpu_inference/runner/compilation_manager.py +180 -80
  229. tpu_inference/runner/kv_cache.py +54 -20
  230. tpu_inference/runner/kv_cache_manager.py +55 -33
  231. tpu_inference/runner/lora_utils.py +16 -1
  232. tpu_inference/runner/multimodal_manager.py +16 -2
  233. tpu_inference/runner/persistent_batch_manager.py +54 -2
  234. tpu_inference/runner/speculative_decoding_manager.py +14 -0
  235. tpu_inference/runner/structured_decoding_manager.py +16 -3
  236. tpu_inference/runner/tpu_runner.py +124 -61
  237. tpu_inference/runner/utils.py +2 -2
  238. tpu_inference/spec_decode/__init__.py +13 -0
  239. tpu_inference/spec_decode/jax/__init__.py +13 -0
  240. tpu_inference/spec_decode/jax/eagle3.py +84 -22
  241. tpu_inference/tpu_info.py +14 -0
  242. tpu_inference/utils.py +72 -44
  243. tpu_inference/worker/__init__.py +13 -0
  244. tpu_inference/worker/tpu_worker.py +66 -52
  245. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
  246. tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
  247. tpu_inference/layers/vllm/linear_common.py +0 -186
  248. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  249. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
  250. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
  251. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
  252. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
  253. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
  254. tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
  255. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
  256. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
  257. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,18 @@
- import os
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.

  import jax
- import jax.numpy as jnp
  import torch
  import torchax
  from jax.sharding import Mesh, NamedSharding, PartitionSpec
@@ -9,6 +20,7 @@ from torch.nn import Parameter
  from torch.utils import _pytree as pytree
  from torchax.interop import jax_view, torch_view
  from torchax.ops.mappings import t2j
+ from vllm import envs as vllm_envs
  from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                                MergedColumnParallelLinearWithLoRA,
                                MergedQKVParallelLinearWithLoRA,
@@ -20,18 +32,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead, VocabParallelEmbedding)

  from tpu_inference import envs
+ from tpu_inference.layers.common.sharding import ShardingAxisName
  from tpu_inference.logger import init_logger
+ from tpu_inference.utils import to_jax_dtype

  P = PartitionSpec

  logger = init_logger(__name__)

- TORCH_TO_JAX_DTYPE_MAP = {
-     torch.float32: jnp.float32,
-     torch.float16: jnp.float16,
-     torch.bfloat16: jnp.bfloat16,
- }
-

  def shard_model_to_tpu(model: torch.nn.Module,
                         mesh: Mesh) -> dict[str, torchax.torch.Tensor]:
@@ -88,10 +96,9 @@ def _tensor_is_in_cpu(tensor: torch.tensor) -> bool:

  def _convert_to_torchax_and_shard(tensor: torch.Tensor,
                                    sharding: NamedSharding) -> torch.Tensor:
-     if os.getenv("VLLM_TPU_USING_PATHWAYS", False) and isinstance(
-             tensor, torch.Tensor):
+     if vllm_envs.VLLM_TPU_USING_PATHWAYS and isinstance(tensor, torch.Tensor):
          np_tensor = tensor.detach().cpu().to(torch.float32).numpy()
-         dtype = TORCH_TO_JAX_DTYPE_MAP.get(tensor.dtype, jnp.float32)
+         dtype = to_jax_dtype(tensor.dtype)
          return torch_view(jax.device_put(np_tensor, sharding).astype(dtype))
      else:
          if isinstance(tensor, torchax.tensor.Tensor):
@@ -109,7 +116,8 @@ def _shard_tensor_to_tpu_replicated(tensor: torch.Tensor,
  def _shard_vocab_parallel_embedding(layer: VocabParallelEmbedding,
                                      mesh: Mesh) -> None:
      weight = _convert_to_torchax_and_shard(
-         layer.weight, NamedSharding(mesh, P('model', None)))
+         layer.weight, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR,
+                                             None)))
      layer.weight = Parameter(weight, requires_grad=False)


@@ -118,11 +126,12 @@ def _shard_lm_head(layer: ParallelLMHead, mesh: Mesh):
      # if that config is set, then we should not create new weights but reuse the
      # weight from VocabParallelEmbedding
      weight = _convert_to_torchax_and_shard(
-         layer.weight, NamedSharding(mesh, P('model', None)))
+         layer.weight, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR,
+                                             None)))
      layer.weight = Parameter(weight, requires_grad=False)
      if layer.bias is not None:
-         bias = _convert_to_torchax_and_shard(layer.bias,
-                                              NamedSharding(mesh, P('model')))
+         bias = _convert_to_torchax_and_shard(
+             layer.bias, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR)))
          layer.bias = Parameter(bias, requires_grad=False)

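Note (not part of the diff): the hunk above replaces the module-local TORCH_TO_JAX_DTYPE_MAP with a shared to_jax_dtype helper from tpu_inference.utils and swaps the hard-coded 'model' axis for ShardingAxisName.MLP_TENSOR. A minimal sketch of what such a dtype helper might look like, assuming it keeps the removed mapping's float32 fallback (the real implementation lives in tpu_inference/utils.py and may differ):

import jax.numpy as jnp
import torch

# Hypothetical stand-in mirroring the removed TORCH_TO_JAX_DTYPE_MAP,
# including its jnp.float32 fallback for unmapped torch dtypes.
def to_jax_dtype(torch_dtype: torch.dtype):
    mapping = {
        torch.float32: jnp.float32,
        torch.float16: jnp.float16,
        torch.bfloat16: jnp.bfloat16,
    }
    return mapping.get(torch_dtype, jnp.float32)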
@@ -0,0 +1,369 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass, fields
+
+ import jax
+ import jax.numpy as jnp
+ from jax.experimental.layout import Format, Layout, with_layout_constraint
+ from jax.sharding import Mesh, NamedSharding, PartitionSpec
+ from torchax.tensor import Tensor
+
+ from tpu_inference.layers.common.quantization import quantize_tensor
+ from tpu_inference.layers.common.sharding import ShardingAxisName
+ from tpu_inference.layers.common.utils import \
+     reorder_concatenated_tensor_for_sharding
+ from tpu_inference.layers.vllm.fused_moe import FusedMoEBackend
+ from tpu_inference.utils import align_to
+
+ P = PartitionSpec
+
+
+ @jax.tree_util.register_dataclass
+ @dataclass
+ class FusedMoEWeights:
+     """Fused moe weights. weights can be either jax or torchax array."""
+     w13_weight: jax.Array | Tensor
+     w13_weight_scale: jax.Array | Tensor | None
+     w13_bias: jax.Array | Tensor | None
+     w2_weight: jax.Array | Tensor
+     w2_weight_scale: jax.Array | Tensor | None
+     w2_bias: jax.Array | Tensor | None
+
+
+ def quantize_moe_weights(
+     weights: FusedMoEWeights,
+     dtype: jnp.dtype,
+     block_size: int | None,
+ ) -> FusedMoEWeights:
+     """Quantize fused moe weights into a given dtype and block size.
+
+     Args:
+         weights: fused moe weights.
+         dtype: dtype to perform quantization.
+         block_size: Specify block quantization size. If non, use per-channel
+             quantization. If contracting dim is not divisible by block size,
+             the dim will be automatically padded and corresponding dim on bias
+             and the other weight (w13_weight <-> w2_weight) is also padded.
+
+     Returns:
+         Quantized fused moe weights that may have also been padded.
+     """
+
+     # If scale is present, it means the weights are already quantized.
+     # Ensure that weights are not quantized by checking if scales are None.
+     assert weights.w13_weight_scale is None
+     assert weights.w2_weight_scale is None
+
+     w13_weight = weights.w13_weight
+     w2_weight = weights.w2_weight
+
+     if block_size is None:
+         # Use per-channel quantizaiton.
+         w13_block_size = w13_weight.shape[-1]
+         w2_block_size = w2_weight.shape[-1]
+     else:
+         w13_block_size = w2_block_size = block_size
+
+     _, orig_hidden_size, orig_intermediate_size = w2_weight.shape
+
+     w13_weight, w13_weight_scale = quantize_tensor(dtype, w13_weight, 2,
+                                                    w13_block_size, True)
+     w2_weight, w2_weight_scale = quantize_tensor(dtype, w2_weight, 2,
+                                                  w2_block_size, True)
+
+     intermediate_size = w2_weight.shape[-1]
+     hidden_size = w13_weight.shape[-1]
+
+     # Dims may have been padded to align with subchannel size during
+     # quantization. We pad the corresponding dim on other weight.
+     # NOTE: We perform padding after quantization as padding value can
+     # affect quantization numerics.
+     w13_pad_widths = [[0, 0] for _ in range(3)]
+     w13_pad_widths[1][1] = 2 * (intermediate_size - orig_intermediate_size)
+     w2_pad_widths = [[0, 0] for _ in range(3)]
+     w2_pad_widths[1][1] = hidden_size - orig_hidden_size
+
+     weights.w13_weight = jnp.pad(w13_weight, w13_pad_widths)
+     weights.w13_weight_scale = jnp.pad(w13_weight_scale, w13_pad_widths)
+     weights.w2_weight = jnp.pad(w2_weight, w2_pad_widths)
+     weights.w2_weight_scale = jnp.pad(w2_weight_scale, w2_pad_widths)
+
+     if (w13_bias := weights.w13_bias) is not None:
+         weights.w13_bias = jnp.pad(w13_bias, w13_pad_widths[:2])
+     if (w2_bias := weights.w2_bias) is not None:
+         weights.w2_bias = jnp.pad(w2_bias, w2_pad_widths[:2])
+
+     return weights
+
+
+ def process_moe_weights(
+     weights: FusedMoEWeights,
+     moe_backend: FusedMoEBackend,
+     w13_reorder_size: int | None = None,
+     w13_interleave: bool = False,
+ ) -> FusedMoEWeights:
+     """Process fused moe weights to a layout that moe backend expects.
+
+     Args:
+         weights: fused moe weights.
+         moe_backend: backend type the weights should be processed for.
+         w13_reorder_size: only used when backend type is GMM_TP. in order to
+             eliminate collective operations when using tensor parallelism,
+             group w13_weight into w13_reorder_size number of chuncks where each
+             chunk stores both w1 and w3 weights.
+         w13_interleave: used when loaded w13_weight is stored in interleaved
+             pattern where even index element is w1 and odd index element is w3.
+             we uninterleave so that first half is w1 and second half is w3.
+
+     Returns:
+         MoE weights that are processed for specified backend.
+     """
+
+     w13_weight = weights.w13_weight
+     w13_weight_scale = weights.w13_weight_scale
+     w13_bias = weights.w13_bias
+     w2_weight = weights.w2_weight
+     w2_weight_scale = weights.w2_weight_scale
+     w2_bias = weights.w2_bias
+
+     num_experts, hidden_size, intermediate_size = w2_weight.shape
+
+     if w13_interleave:
+         w1_weight = w13_weight[:, ::2, :]
+         w3_weight = w13_weight[:, 1::2, :]
+         w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+
+         if w13_weight_scale is not None:
+             w1_weight_scale = w13_weight_scale[:, ::2, :]
+             w3_weight_scale = w13_weight_scale[:, 1::2, :]
+             w13_weight_scale = jnp.concat([w1_weight_scale, w3_weight_scale],
+                                           axis=1)
+
+         if w13_bias is not None:
+             w1_bias = w13_bias[:, ::2]
+             w3_bias = w13_bias[:, 1::2]
+             w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
+
+     if w13_weight_scale is not None:
+         w13_weight_scale = w13_weight_scale.astype(jnp.float32)
+     if w2_weight_scale is not None:
+         w2_weight_scale = w2_weight_scale.astype(jnp.float32)
+     if w13_bias is not None:
+         w13_bias = w13_bias.astype(jnp.float32)
+     if w2_bias is not None:
+         w2_bias = w2_bias.astype(jnp.float32)
+
+     match moe_backend:
+         case FusedMoEBackend.FUSED_MOE:
+             # Kernel expects:
+             #   w13: (num_experts, 2, hidden_size, intermediate_size)
+             #   w2: (num_experts, intermediate_size, hidden_size)
+             # Current format:
+             #   w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+             #   w2_weight: (num_experts, hidden_size, intermediate_size)
+
+             # Fused moe kernel expects dims to be multiple of 256.
+             pad_width_intermediate_size = align_to(intermediate_size,
+                                                    256) - intermediate_size
+             pad_width_hidden_size = align_to(hidden_size, 256) - hidden_size
+
+             w13_weight = w13_weight.reshape(
+                 num_experts,
+                 2,
+                 intermediate_size,
+                 hidden_size,
+             )
+
+             # Transpose non-constracting dim to right most dim
+             w13_weight = jnp.swapaxes(w13_weight, 2, 3)
+             w2_weight = jnp.swapaxes(w2_weight, 1, 2)
+
+             # Workaround for JAX error "must have valid byte strides"
+             w13_weight = with_layout_constraint(w13_weight, Layout(
+                 (0, 1, 2, 3)))
+             w2_weight = with_layout_constraint(w2_weight, Layout((0, 1, 2)))
+
+             w13_weight = jnp.pad(
+                 w13_weight,
+                 ((0, 0), (0, 0), (0, pad_width_hidden_size),
+                  (0, pad_width_intermediate_size)),
+             )
+
+             w2_weight = jnp.pad(
+                 w2_weight,
+                 ((0, 0), (0, pad_width_intermediate_size),
+                  (0, pad_width_hidden_size)),
+             )
+
+             if w13_weight_scale is not None:
+                 w13_weight_scale = w13_weight_scale.reshape(
+                     num_experts, 2, intermediate_size, 1, -1)
+                 w13_weight_scale = jnp.swapaxes(w13_weight_scale, 2, 4)
+                 w13_weight_scale = jnp.pad(
+                     w13_weight_scale,
+                     ((0, 0), (0, 0), (0, pad_width_hidden_size), (0, 0),
+                      (0, pad_width_intermediate_size)),
+                 )
+             if w2_weight_scale is not None:
+                 w2_weight_scale = w2_weight_scale.reshape(
+                     num_experts, hidden_size, 1, -1)
+                 w2_weight_scale = jnp.swapaxes(w2_weight_scale, 1, 3)
+                 w2_weight_scale = jnp.pad(
+                     w2_weight_scale,
+                     ((0, 0), (0, pad_width_intermediate_size), (0, 0),
+                      (0, pad_width_hidden_size)),
+                 )
+
+             if w13_bias is not None:
+                 w13_bias = w13_bias.reshape(num_experts, 2, 1,
+                                             intermediate_size)
+                 w13_bias = jnp.pad(
+                     w13_bias,
+                     ((0, 0), (0, 0), (0, 0), (0, pad_width_intermediate_size)),
+                 )
+             if w2_bias is not None:
+                 w2_bias = w2_bias.reshape(num_experts, 1, hidden_size)
+                 w2_bias = jnp.pad(
+                     w2_bias,
+                     ((0, 0), (0, 0), (0, pad_width_hidden_size)),
+                 )
+
+         case FusedMoEBackend.GMM_EP | FusedMoEBackend.GMM_TP:
+             if w13_weight_scale is not None:
+                 w13_weight_scale = jnp.swapaxes(w13_weight_scale, 1, 2)
+                 w13_weight_scale = jnp.expand_dims(w13_weight_scale, 2)
+             if w2_weight_scale is not None:
+                 w2_weight_scale = jnp.swapaxes(w2_weight_scale, 1, 2)
+                 w2_weight_scale = jnp.expand_dims(w2_weight_scale, 2)
+             if w13_bias is not None:
+                 w13_bias = jnp.expand_dims(w13_bias, 1)
+             if w2_bias is not None:
+                 w2_bias = jnp.expand_dims(w2_bias, 1)
+
+             if moe_backend == FusedMoEBackend.GMM_TP:
+                 assert w13_reorder_size is not None
+                 assert intermediate_size % w13_reorder_size == 0
+                 output_sizes = [intermediate_size, intermediate_size]
+                 w13_weight = reorder_concatenated_tensor_for_sharding(
+                     w13_weight,
+                     output_sizes,
+                     w13_reorder_size,
+                     dim=1,
+                 )
+                 if w13_weight_scale is not None:
+                     w13_weight_scale = reorder_concatenated_tensor_for_sharding(
+                         w13_weight_scale,
+                         output_sizes,
+                         w13_reorder_size,
+                         dim=3,
+                     )
+                 if w13_bias is not None:
+                     w13_bias = reorder_concatenated_tensor_for_sharding(
+                         w13_bias,
+                         output_sizes,
+                         w13_reorder_size,
+                         dim=2,
+                     )
+
+     return FusedMoEWeights(
+         w13_weight=w13_weight,
+         w13_weight_scale=w13_weight_scale,
+         w13_bias=w13_bias,
+         w2_weight=w2_weight,
+         w2_weight_scale=w2_weight_scale,
+         w2_bias=w2_bias,
+     )
+
+
+ def shard_moe_weights(
+     weights: FusedMoEWeights,
+     moe_backend: FusedMoEBackend,
+     mesh: Mesh,
+ ) -> FusedMoEWeights:
+
+     match moe_backend:
+         case FusedMoEBackend.FUSED_MOE | FusedMoEBackend.GMM_EP:
+             ep_sharding = NamedSharding(mesh, P(ShardingAxisName.EXPERT))
+             weight_shardings = FusedMoEWeights(
+                 w13_weight=ep_sharding,
+                 w13_weight_scale=ep_sharding,
+                 w13_bias=ep_sharding,
+                 w2_weight=ep_sharding,
+                 w2_weight_scale=ep_sharding,
+                 w2_bias=ep_sharding,
+             )
+         case FusedMoEBackend.GMM_TP:
+             # When using per-channel, in_dim // block_size == 1. This means we
+             # are unable to shard w2_weight_scale along 1st dim. Therefore, we
+             # fully replicate it instead.
+             if (weights.w2_weight_scale is not None
+                     and weights.w2_weight_scale.shape[1] == 1):
+                 w2_weight_scale_p_spec = P()
+             else:
+                 w2_weight_scale_p_spec = P(None, ShardingAxisName.MLP_TENSOR)
+             weight_shardings = FusedMoEWeights(
+                 w13_weight=NamedSharding(
+                     mesh,
+                     P(None, ShardingAxisName.MLP_TENSOR, None),
+                 ),  # (num_experts, out_dim, in_dim)
+                 w13_weight_scale=NamedSharding(
+                     mesh,
+                     P(None, None, None, ShardingAxisName.MLP_TENSOR),
+                 ),  # (num_experts, in_dim // block_size, 1, out_dim)
+                 w13_bias=NamedSharding(
+                     mesh,
+                     P(None, None, ShardingAxisName.MLP_TENSOR),
+                 ),  # (num_experts, 1, out_dim)
+                 w2_weight=NamedSharding(
+                     mesh,
+                     P(None, None, ShardingAxisName.MLP_TENSOR),
+                 ),  # (num_experts, out_dim, in_dim)
+                 w2_weight_scale=NamedSharding(
+                     mesh, w2_weight_scale_p_spec
+                 ),  # (num_experts, in_dim // block_size, 1, out_dim)
+                 w2_bias=NamedSharding(
+                     mesh,
+                     P(None, None, None),
+                 ),  # (num_experts, 1, out_dim)
+             )
+
+     match moe_backend:
+         case FusedMoEBackend.FUSED_MOE:
+             weight_layouts = FusedMoEWeights(
+                 w13_weight=Layout((0, 1, 2, 3)),
+                 w13_weight_scale=Layout((0, 1, 2, 3, 4)),
+                 w13_bias=Layout((0, 1, 2, 3)),
+                 w2_weight=Layout((0, 1, 2)),
+                 w2_weight_scale=Layout((0, 1, 2, 3)),
+                 w2_bias=Layout((0, 1, 2)),
+             )
+         case FusedMoEBackend.GMM_TP | FusedMoEBackend.GMM_EP:
+             weight_layouts = FusedMoEWeights(
+                 w13_weight=Layout((0, 1, 2)),
+                 w13_weight_scale=Layout((0, 1, 2, 3)),
+                 w13_bias=Layout((0, 1, 2)),
+                 w2_weight=Layout((0, 1, 2)),
+                 w2_weight_scale=Layout((0, 1, 2, 3)),
+                 w2_bias=Layout((0, 1, 2)),
+             )
+
+     for field in fields(FusedMoEWeights):
+         key = field.name
+         if (weight := getattr(weights, key, None)) is not None:
+             layout = getattr(weight_layouts, key)
+             sharding = getattr(weight_shardings, key)
+             weight = jax.device_put(weight, Format(layout, sharding))
+             setattr(weights, key, weight)
+     return weights
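Note (not part of the diff): the new file above defines the quantize / process / shard pipeline for fused MoE weights. A minimal sketch of how these helpers might be chained for the GMM tensor-parallel backend; the shapes, the int8 dtype, the reorder size, and the call order are illustrative assumptions, and only APIs defined in this file (plus FusedMoEBackend from the sibling fused_moe module) are used. Mesh placement via shard_moe_weights is omitted because it needs a device mesh with the package's sharding axis names.

import jax.numpy as jnp

from tpu_inference.layers.vllm.fused_moe import FusedMoEBackend
from tpu_inference.layers.vllm.process_weights.fused_moe_weights import (
    FusedMoEWeights, process_moe_weights, quantize_moe_weights)

num_experts, hidden, intermediate = 8, 1024, 512

# Unquantized weights in the layout the comments above describe:
# w13: (num_experts, 2 * intermediate, hidden), w2: (num_experts, hidden, intermediate).
weights = FusedMoEWeights(
    w13_weight=jnp.zeros((num_experts, 2 * intermediate, hidden), jnp.bfloat16),
    w13_weight_scale=None,
    w13_bias=None,
    w2_weight=jnp.zeros((num_experts, hidden, intermediate), jnp.bfloat16),
    w2_weight_scale=None,
    w2_bias=None,
)

# Per-channel int8 quantization (block_size=None), then regroup the w1/w3
# chunks so the GMM_TP backend can shard them without extra collectives.
weights = quantize_moe_weights(weights, jnp.int8, block_size=None)
weights = process_moe_weights(weights, FusedMoEBackend.GMM_TP, w13_reorder_size=4)
# shard_moe_weights(weights, FusedMoEBackend.GMM_TP, mesh) would then place
# the arrays on the device mesh; mesh construction is not shown here.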
@@ -0,0 +1,174 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass, fields
+
+ import jax
+ import torch
+ from jax.sharding import Mesh, NamedSharding, PartitionSpec
+ from torch.nn import ParameterList
+ from torch.nn.parameter import Parameter
+ from torchax.tensor import Tensor
+
+ from tpu_inference.layers.common.utils import \
+     reorder_concatenated_tensor_for_sharding
+ from tpu_inference.logger import init_logger
+
+ P = PartitionSpec
+
+ logger = init_logger(__name__)
+
+
+ @jax.tree_util.register_dataclass
+ @dataclass
+ class LinearWeights:
+     weight: jax.Array | Tensor | list[jax.Array | Tensor]
+     weight_scale: jax.Array | Tensor | list[jax.Array | Tensor] | None
+     zero_point: jax.Array | Tensor | list[jax.Array | Tensor] | None
+     bias: jax.Array | Tensor | list[jax.Array | Tensor] | None
+
+
+ MODEL_MATMUL_FUSION_TRUTH_TABLE = {
+     ("Qwen/Qwen2.5-7B-Instruct", 1024, 1, "QKVParallelLinear"):
+     True,
+     ("Qwen/Qwen2.5-7B-Instruct", 1024, 1, "MergedColumnParallelLinear"):
+     False,
+     ("Qwen/Qwen2.5-7B-Instruct", 2048, 1, "QKVParallelLinear"):
+     False,
+     ("Qwen/Qwen2.5-7B-Instruct", 2048, 1, "MergedColumnParallelLinear"):
+     False,
+     ("meta-llama/Llama-3.1-8B-Instruct", 1024, 1, "QKVParallelLinear"):
+     False,
+     ("meta-llama/Llama-3.1-8B-Instruct", 1024, 1, "MergedColumnParallelLinear"):
+     False,
+     ("meta-llama/Llama-3.1-8B-Instruct", 2048, 1, "QKVParallelLinear"):
+     False,
+     ("meta-llama/Llama-3.1-8B-Instruct", 2048, 1, "MergedColumnParallelLinear"):
+     False,
+     ("RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", 1024, 1, "QKVParallelLinear"):
+     False,
+     ("RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", 1024, 1, "MergedColumnParallelLinear"):
+     False,
+     ("RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", 2048, 1, "QKVParallelLinear"):
+     False,
+     ("RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", 2048, 1, "MergedColumnParallelLinear"):
+     False,
+ }
+
+
+ def to_parameter_list(tensor: list[torch.Tensor]):
+     tensor = [Parameter(t, requires_grad=False) for t in tensor]
+     return ParameterList(tensor)
+
+
+ def get_model_matmul_fusion_assignment(model_name: str, batch_size: int,
+                                        tp_size: int, layer_name: str):
+     key = (model_name, batch_size, tp_size, layer_name)
+     return MODEL_MATMUL_FUSION_TRUTH_TABLE.get(key, True)
+
+
+ def process_lienar_weights(
+     weights: LinearWeights,
+     fused: bool = False,
+     output_sizes: list[int] | None = None,
+     reorder_size: int | None = None,
+     transposed: bool = True,
+     per_tensor: bool = False,
+ ) -> LinearWeights:
+     weight = weights.weight
+     weight_scale = weights.weight_scale
+     zero_point = weights.zero_point
+     bias = weights.bias
+
+     dim = 0 if transposed else -1
+     if output_sizes is None:
+         output_sizes = [weight.shape[dim]]
+
+     if fused:
+         assert reorder_size is not None
+         weight = reorder_concatenated_tensor_for_sharding(
+             weight, output_sizes, reorder_size, dim)
+
+         if weight_scale is not None and not per_tensor:
+             weight_scale = reorder_concatenated_tensor_for_sharding(
+                 weight_scale, output_sizes, reorder_size, dim)
+         if zero_point is not None:
+             zero_point = reorder_concatenated_tensor_for_sharding(
+                 zero_point, output_sizes, reorder_size, dim)
+         if bias is not None:
+             bias = reorder_concatenated_tensor_for_sharding(
+                 bias, output_sizes, reorder_size, dim)
+     else:
+
+         def slice_tensor(tensor):
+             tensors = []
+             start = 0
+             for size in output_sizes:
+                 end = start + size
+                 tensor_split = jax.lax.slice_in_dim(tensor,
+                                                     start,
+                                                     end,
+                                                     axis=dim)
+                 tensors.append(tensor_split)
+                 start = end
+             return tensors
+
+         weight = slice_tensor(weight)
+         if weight_scale is not None and not per_tensor:
+             weight_scale = slice_tensor(weight_scale)
+         if zero_point is not None:
+             zero_point = slice_tensor(zero_point)
+         if bias is not None:
+             bias = slice_tensor(bias)
+
+     return LinearWeights(
+         weight=weight,
+         weight_scale=weight_scale,
+         zero_point=zero_point,
+         bias=bias,
+     )
+
+
+ def shard_linear_weights(
+     weights: LinearWeights,
+     mesh: Mesh,
+     weight_p_spec: PartitionSpec,
+     bias_p_spec: PartitionSpec,
+     transposed: bool = True,
+     per_tensor: bool = False,
+ ) -> LinearWeights:
+
+     if not transposed:
+         # By defualt, we use transposed weights. If it is not transposed,
+         # we need to transpose the sharding as well.
+         weight_p_spec = PartitionSpec(*weight_p_spec[::-1])
+         bias_p_spec = PartitionSpec(weight_p_spec[0])
+
+     weight_sharding = NamedSharding(mesh, weight_p_spec)
+     bias_sharding = NamedSharding(mesh, bias_p_spec)
+
+     weight_shardings = LinearWeights(
+         weight=weight_sharding,
+         weight_scale=NamedSharding(mesh, P()) if per_tensor else bias_sharding,
+         zero_point=bias_sharding,
+         bias=bias_sharding,
+     )
+
+     for field in fields(LinearWeights):
+         key = field.name
+         if (weight := getattr(weights, key, None)) is not None:
+             sharding = getattr(weight_shardings, key)
+             weight = jax.device_put(weight, sharding)
+             setattr(weights, key, weight)
+     return weights
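Note (not part of the diff): a rough usage sketch of the new linear-weight helpers above, showing the matmul-fusion truth table feeding process_lienar_weights (spelled as released) for a transposed, concatenated QKV weight. The sizes, model name, and reorder size are placeholders for illustration, and only functions defined in this file are called.

import jax.numpy as jnp

from tpu_inference.layers.vllm.process_weights.linear_weights import (
    LinearWeights, get_model_matmul_fusion_assignment, process_lienar_weights)

# Transposed layout: (output_size, input_size) with Q, K and V concatenated on dim 0.
q, kv, hidden = 4096, 1024, 4096
weights = LinearWeights(
    weight=jnp.zeros((q + 2 * kv, hidden), jnp.bfloat16),
    weight_scale=None,
    zero_point=None,
    bias=None,
)

fused = get_model_matmul_fusion_assignment(
    "Qwen/Qwen2.5-7B-Instruct", batch_size=1024, tp_size=1,
    layer_name="QKVParallelLinear")  # True per the table above

weights = process_lienar_weights(
    weights,
    fused=fused,
    output_sizes=[q, kv, kv],
    reorder_size=4 if fused else None,
    transposed=True,
)
# With fused=False the weight comes back as a list of three per-projection
# slices; with fused=True it stays a single array, reordered for sharding.
# shard_linear_weights would then place it on a mesh with a chosen PartitionSpec.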
@@ -1,3 +1,17 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
  import copy

  from jax.sharding import Mesh
@@ -7,9 +21,10 @@ from vllm.model_executor.layers.quantization.base_config import \

  from tpu_inference.layers.common import quant_methods
  from tpu_inference.layers.vllm.quantization.awq import VllmAWQConfig
- from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
  from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors import \
-     VllmCompressedTensorsConfig  # noqa: E501
+     VllmCompressedTensorsConfig
+ from tpu_inference.layers.vllm.quantization.configs import VllmQuantConfig
+ from tpu_inference.layers.vllm.quantization.fp8 import VllmFp8Config
  from tpu_inference.layers.vllm.quantization.mxfp4 import VllmMxfp4Config
  from tpu_inference.layers.vllm.quantization.unquantized import \
      VllmUnquantizedConfig
@@ -23,6 +38,7 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
          None: VllmUnquantizedConfig,
          quant_methods.COMPRESSED_TENSORS: VllmCompressedTensorsConfig,
          quant_methods.AWQ: VllmAWQConfig,
+         quant_methods.FP8: VllmFp8Config,
          quant_methods.MXFP4: VllmMxfp4Config,
      }
      if model_config.quantization not in method_to_config:
@@ -30,7 +46,7 @@ def get_tpu_quantization_config(vllm_config: VllmConfig,
          f"{model_config.quantization} quantization method not supported."
          f" Supported methods are {method_to_config.keys()}")
      quant_config = method_to_config[model_config.quantization]
-     assert issubclass(quant_config, JaxCommonConfig)
+     assert issubclass(quant_config, VllmQuantConfig)
      quant_config.set_configs(vllm_config, mesh)

      model_config.quantization = quant_methods.get_tpu_quant_method(