tpu-inference 0.12.0.dev20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
@@ -0,0 +1,416 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Optional, Union
+
+ import jax
+ import jax.numpy as jnp
+ import torch
+ from jax.experimental.layout import Format, Layout
+ from jax.sharding import Mesh, NamedSharding, PartitionSpec
+ from torch.nn.parameter import Parameter
+ from torchax.interop import jax_view, torch_view
+ from torchax.ops.mappings import t2j
+ from vllm.attention.layer import Attention
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.fused_moe.layer import (
+     FusedMoE, FusedMoEConfig, UnquantizedFusedMoEMethod)
+ from vllm.model_executor.layers.fused_moe.modular_kernel import (
+     FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
+ from vllm.model_executor.layers.linear import (LinearBase,
+                                                UnquantizedLinearMethod)
+ from vllm.model_executor.layers.quantization import \
+     register_quantization_config
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig, QuantizeMethodBase)
+
+ from tpu_inference import envs
+ from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
+ from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
+                                                         get_tpu_quant_method)
+ from tpu_inference.layers.vllm.fused_moe import fused_moe_func
+ from tpu_inference.layers.vllm.linear_common import (
+     reorder_concatenated_tensor_for_sharding,
+     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
+ from tpu_inference.layers.vllm.quantization.common import (
+     JaxCommonConfig, JaxCommonLinearConfig)
+
+ P = PartitionSpec
+ logger = init_logger(__name__)
+
+
+ def align_to(a, b):
+     return (a + b - 1) // b * b
+
+
+ @register_quantization_config(get_tpu_quant_method(UNQUANTIZED))
+ class VllmUnquantizedConfig(QuantizationConfig, JaxCommonConfig):
+
+     @classmethod
+     def get_name(cls) -> str:
+         return UNQUANTIZED
+
+     @classmethod
+     def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+         return [torch.float32, torch.float16, torch.bfloat16]
+
+     @classmethod
+     def get_min_capability(cls) -> int:
+         return 0  # Always supported
+
+     @classmethod
+     def get_config_filenames(cls) -> list[str]:
+         return []  # No extra configs required.
+
+     @classmethod
+     def from_config(cls, _: dict[str, Any]) -> "VllmUnquantizedConfig":
+         return cls()
+
+     def get_quant_method(self, layer: torch.nn.Module,
+                          prefix: str) -> Optional[QuantizeMethodBase]:
+         if isinstance(layer, LinearBase):
+             linear_config = self.get_linear_config(layer)
+             return VllmUnquantizedLinearMethod(linear_config)
+         if isinstance(layer, FusedMoE):
+             moe_config = self.get_moe_config(layer)
+             return VllmUnquantizedFusedMoEMethod(moe_config, self.mesh)
+         if isinstance(layer, Attention):
+             return None
+         return None
+
+
+ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
+
+     def __init__(self, jax_config: JaxCommonLinearConfig):
+         self.jax_config = jax_config
+
+     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+         weight = torch_to_jax_param(
+             layer.weight,
+             NamedSharding(self.jax_config.mesh,
+                           self.jax_config.weight_sharding),
+             self.jax_config.output_sizes,
+             self.jax_config.n_shards,
+             self.jax_config.fuse_matmuls,
+         )
+         delattr(layer, "weight")
+         layer.weight = weight
+
+         if layer.bias is not None and not layer.skip_bias_add:
+             if layer.return_bias:
+                 logger.warning_once("Bias might return incorrect value.")
+
+             bias = torch_to_jax_param(
+                 layer.bias,
+                 NamedSharding(self.jax_config.mesh,
+                               self.jax_config.bias_sharding),
+                 self.jax_config.output_sizes,
+                 self.jax_config.n_shards,
+                 self.jax_config.fuse_matmuls,
+             )
+             delattr(layer, "bias")
+             layer.bias = bias
+
+     def apply(self,
+               layer: torch.nn.Module,
+               x: torch.Tensor,
+               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+         assert isinstance(layer, LinearBase)
+
+         with jax.named_scope(layer._get_name()):
+             if in_sharding := self.jax_config.get_input_sharding(x):
+                 x.shard_(NamedSharding(self.jax_config.mesh, in_sharding))
+
+             if self.jax_config.fuse_matmuls:
+                 out = self._apply_fused(layer, x, bias)
+             else:
+                 out = self._apply_split(layer, x, bias)
+
+             if out_sharding := self.jax_config.get_output_sharding(out):
+                 out.shard_(NamedSharding(self.jax_config.mesh, out_sharding))
+
+         return out
+
+     def _apply_fused(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+         x_jax = jax_view(x)
+         weight_jax = jax_view(layer.weight)
+
+         outs = jnp.einsum("mn,pn->mp", x_jax, weight_jax)
+         if bias is not None and not layer.skip_bias_add:
+             outs += bias.jax()
+
+         outs = slice_sharded_tensor_for_concatenation(
+             outs, self.jax_config.output_sizes, self.jax_config.n_shards)
+         out = jnp.concatenate(outs, axis=-1)
+         return torch_view(out)
+
+     def _apply_split(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+         assert isinstance(layer.weight, torch.nn.ParameterList)
+
+         x_jax = x.jax()
+         outs = []
+         for i, weight in enumerate(layer.weight):
+             weight_jax = jax_view(weight)
+
+             out = jnp.einsum("mn,pn->mp", x_jax, weight_jax)
+             if bias is not None and not layer.skip_bias_add:
+                 out += jax_view(bias[i])
+
+             outs.append(out)
+         out = jnp.concatenate(outs, axis=-1)
+         return torch_view(out)
+
+
+ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
+
+     def __init__(self,
+                  moe: FusedMoEConfig,
+                  mesh: Mesh,
+                  ep_axis_name: str = 'model'):
+         super().__init__(moe)
+         self.mesh = mesh
+         self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
+         self.ep_axis_name = ep_axis_name
+         # TODO: Use autotune table once we have it.
+         self.block_size = {
+             "bt": 64,
+             "bf": 1024,
+             "bd1": 1536,
+             "bd2": 1536,
+             "btc": 64,
+             "bfc": 1024,
+             "bd1c": 1536,
+             "bd2c": 1536,
+         }
+
+     def select_gemm_impl(
+         self,
+         prepare_finalize: FusedMoEPrepareAndFinalize,
+         moe: FusedMoEConfig,
+         layer: torch.nn.Module,
+     ) -> FusedMoEPermuteExpertsUnpermute:
+         raise NotImplementedError(
+             "Selecting gemm implementation is currently not supported.")
+
+     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+         assert isinstance(layer, FusedMoE)
+         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
+         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
+
+         num_experts, hidden_size, intermediate_size = w2_weight.shape
+
+         if self.moe.has_bias:
+             w13_bias = t2j(layer.w13_bias, use_dlpack=False)
+             w2_bias = t2j(layer.w2_bias, use_dlpack=False)
+
+         if layer.activation == "swigluoai":
+             # When using swigluoai, vLLM splits the gmm output in an
+             # interleaved way. However, an interleaved split is not performant
+             # on TPU. Therefore, we preprocess the weight so that splitting the
+             # gmm output down the middle still gives the same result.
+             w1_weight = w13_weight[:, ::2, :]
+             w3_weight = w13_weight[:, 1::2, :]
+             w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
+
+             if self.moe.has_bias:
+                 w1_bias = w13_bias[:, ::2]
+                 w3_bias = w13_bias[:, 1::2]
+                 w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
+
+         if self.use_kernel:
+             # Kernel expects:
+             #   w13: (num_experts, 2, hidden_size, intermediate_size)
+             #   w2: (num_experts, intermediate_size, hidden_size)
+             # Current format:
+             #   w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+             #   w2_weight: (num_experts, hidden_size, intermediate_size)
+             num_experts = w13_weight.shape[0]
+             intermediate_size = w13_weight.shape[1] // 2
+             hidden_size = w13_weight.shape[2]
+
+             padded_intermediate_size = align_to(intermediate_size, 256)
+             padded_hidden_size = align_to(hidden_size, 256)
+
+             # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+             w13_weight = w13_weight.reshape(num_experts, 2, intermediate_size,
+                                             hidden_size)
+             w13_weight = jnp.swapaxes(w13_weight, 3, 2)
+
+             w2_weight = jnp.swapaxes(w2_weight, 2, 1)
+
+             w13_weight = jnp.pad(
+                 w13_weight,
+                 ((0, 0), (0, 0), (0, padded_hidden_size - hidden_size),
+                  (0, padded_intermediate_size - intermediate_size)),
+                 constant_values=0)
+
+             w2_weight = jnp.pad(
+                 w2_weight,
+                 ((0, 0), (0, padded_intermediate_size - intermediate_size),
+                  (0, padded_hidden_size - hidden_size)),
+                 constant_values=0)
+
+             # Apply EP sharding
+             ep_sharding = NamedSharding(self.mesh, P("model"))
+
+             w13_weight = jax.device_put(
+                 w13_weight,
+                 Format(Layout((0, 1, 2, 3)),
+                        NamedSharding(self.mesh, P("model", None, None, None))))
+             w2_weight = jax.device_put(
+                 w2_weight,
+                 Format(Layout((0, 1, 2)),
+                        NamedSharding(self.mesh, P("model", None, None))))
+
+             if self.moe.has_bias:
+                 w13_bias = w13_bias.astype(jnp.float32).reshape(
+                     num_experts, 2, 1, intermediate_size)
+                 w2_bias = w2_bias.astype(jnp.float32).reshape(
+                     num_experts, 1, hidden_size)
+
+                 w13_bias = jnp.pad(
+                     w13_bias,
+                     ((0, 0), (0, 0), (0, 0),
+                      (0, padded_intermediate_size - intermediate_size)),
+                     constant_values=0)
+
+                 w2_bias = jnp.pad(w2_bias,
+                                   ((0, 0), (0, 0),
+                                    (0, padded_hidden_size - hidden_size)),
+                                   constant_values=0)
+
+                 # Apply EP sharding
+                 w13_bias = jax.device_put(
+                     w13_bias, Format(Layout((0, 1, 2, 3)), ep_sharding))
+                 w2_bias = jax.device_put(
+                     w2_bias, Format(Layout((0, 1, 2)), ep_sharding))
+         else:
+             if self.moe.has_bias:
+                 w13_bias = jnp.expand_dims(w13_bias, 1)
+                 w2_bias = jnp.expand_dims(w2_bias, 1)
+
+             if layer.use_ep:
+                 ep_sharding = NamedSharding(self.mesh, P("model"))
+                 w13_weight = jax.device_put(
+                     w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
+                 w2_weight = jax.device_put(
+                     w2_weight, Format(Layout((0, 1, 2)), ep_sharding))
+
+                 if self.moe.has_bias:
+                     w13_bias = jax.device_put(
+                         w13_bias, Format(Layout((0, 1, 2)), ep_sharding))
+                     w2_bias = jax.device_put(
+                         w2_bias, Format(Layout((0, 1, 2)), ep_sharding))
+
+             else:
+                 output_sizes = [intermediate_size, intermediate_size]
+                 n_shards = self.mesh.shape["model"]
+                 assert intermediate_size % n_shards == 0
+
+                 w13_weight = reorder_concatenated_tensor_for_sharding(
+                     w13_weight, output_sizes, n_shards, dim=1)
+                 w13_weight = jax.device_put(
+                     w13_weight,
+                     Format(Layout((0, 1, 2)),
+                            NamedSharding(self.mesh, P(None, "model", None))))
+                 w2_weight = jax.device_put(
+                     w2_weight,
+                     Format(Layout((0, 1, 2)),
+                            NamedSharding(self.mesh, P(None, None, "model"))))
+
+                 if self.moe.has_bias:
+                     w13_bias = reorder_concatenated_tensor_for_sharding(
+                         w13_bias, output_sizes, n_shards, dim=2)
+
+                     w13_bias = jax.device_put(
+                         w13_bias,
+                         Format(
+                             Layout((0, 1, 2)),
+                             NamedSharding(self.mesh, P(None, None, "model"))))
+                     w2_bias = jax.device_put(
+                         w2_bias,
+                         Format(Layout((0, 1, 2)),
+                                NamedSharding(self.mesh, P(None, None, None))))
+
+         layer.w13_weight = Parameter(torch_view(w13_weight),
+                                      requires_grad=False)
+         layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
+
+         if self.moe.has_bias:
+             layer.w13_bias = Parameter(torch_view(w13_bias),
+                                        requires_grad=False)
+             layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
+
+     def apply(
+         self,
+         layer: torch.nn.Module,
+         x: torch.Tensor,
+         router_logits: torch.Tensor,
+     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+         assert isinstance(layer, FusedMoE)
+         if layer.scoring_func != "softmax":
+             raise NotImplementedError(
+                 "Only softmax is supported for scoring_func")
+
+         x = jax_view(x)
+         w13_weight = jax_view(layer.w13_weight)
+         w2_weight = jax_view(layer.w2_weight)
+         w13_bias = w2_bias = None
+         if self.moe.has_bias:
+             w13_bias = jax_view(layer.w13_bias)
+             w2_bias = jax_view(layer.w2_bias)
+         gating_output = jax_view(router_logits)
+
+         if self.use_kernel:
+             actual_hidden_size = x.shape[-1]
+             padding_size = w13_weight.shape[-2] - actual_hidden_size
+             x = jnp.pad(x, ((0, 0), (0, padding_size)))
+             output = fused_ep_moe(
+                 mesh=self.mesh,
+                 tokens=x,
+                 w1=w13_weight,
+                 w2=w2_weight,
+                 b1=w13_bias,
+                 b2=w2_bias,
+                 gating_output=gating_output,
+                 top_k=layer.top_k,
+                 ep_axis_name=self.ep_axis_name,
+                 renormalize_topk_logits=layer.renormalize,
+                 act_fn=layer.activation,
+                 **self.block_size,
+             )[:, :actual_hidden_size]
+         else:
+             output = fused_moe_func(
+                 hidden_states=x,
+                 w1=w13_weight,
+                 w2=w2_weight,
+                 w1_scale=None,
+                 w2_scale=None,
+                 w1_bias=w13_bias,
+                 w2_bias=w2_bias,
+                 gating_output=gating_output,
+                 topk=layer.top_k,
+                 renormalize=layer.renormalize,
+                 mesh=self.mesh,
+                 use_ep=layer.use_ep,
+                 activation=layer.activation,
+             )
+
+         return torch_view(output)
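
The swigluoai preprocessing in VllmUnquantizedFusedMoEMethod.process_weights_after_loading above is easiest to check with toy shapes: de-interleaving the fused w13 rows once at load time makes a cheap middle split of the GMM output equivalent to vLLM's interleaved split. A minimal sketch of that equivalence (the toy sizes and variable names are illustrative only, not part of the package):

    import jax.numpy as jnp

    # Toy fused w13 for one expert: 2 * intermediate_size = 6 rows, hidden = 2.
    w13 = jnp.arange(12).reshape(1, 6, 2)

    # vLLM's convention interleaves w1 and w3 rows along axis 1.
    w1_interleaved = w13[:, ::2, :]
    w3_interleaved = w13[:, 1::2, :]

    # Load-time reorder used above: all w1 rows first, then all w3 rows.
    w13_reordered = jnp.concatenate([w1_interleaved, w3_interleaved], axis=1)

    # A plain middle split now recovers exactly the same two halves.
    w1_mid, w3_mid = jnp.split(w13_reordered, 2, axis=1)
    assert (w1_mid == w1_interleaved).all() and (w3_mid == w3_interleaved).all()
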
@@ -0,0 +1,244 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+
+ import jax
+ import jax.numpy as jnp
+ import torch
+ import torchax
+ from jax.sharding import Mesh, NamedSharding, PartitionSpec
+ from torch.nn import Parameter
+ from torch.utils import _pytree as pytree
+ from torchax.interop import jax_view, torch_view
+ from torchax.ops.mappings import t2j
+ from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
+                               MergedColumnParallelLinearWithLoRA,
+                               MergedQKVParallelLinearWithLoRA,
+                               QKVParallelLinearWithLoRA,
+                               ReplicatedLinearWithLoRA,
+                               RowParallelLinearWithLoRA)
+ from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
+     ParallelLMHead, VocabParallelEmbedding)
+
+ from tpu_inference import envs
+ from tpu_inference.logger import init_logger
+
+ P = PartitionSpec
+
+ logger = init_logger(__name__)
+
+ TORCH_TO_JAX_DTYPE_MAP = {
+     torch.float32: jnp.float32,
+     torch.float16: jnp.float16,
+     torch.bfloat16: jnp.bfloat16,
+ }
+
+
+ def shard_model_to_tpu(model: torch.nn.Module,
+                        mesh: Mesh) -> dict[str, torchax.torch.Tensor]:
+     """
+     Shard the model weights and move them to TPU.
+     At the same time, also turn the weight tensors into torchax tensors so that
+     jax code can interop with them and the overall program can be traced and
+     compiled in XLA.
+     Args:
+         model: A PyTorch model whose weights are on CPU main memory.
+         mesh: JAX mesh object for sharding.
+     Returns:
+         Dictionary of parameters and buffers that will be used as arguments of
+         torch.func.functional_call
+     """
+
+     with jax.default_device(jax.devices("cpu")[0]):
+         _shard_module_to_tpu(model, mesh)
+
+     params, buffers = _extract_all_params_buffers(model)
+
+     # For the other weight tensors, replicate them on all the TPU chips.
+     params, buffers = pytree.tree_map_only(
+         _tensor_is_in_cpu,
+         lambda tensor: _shard_tensor_to_tpu_replicated(tensor, mesh),
+         (params, buffers))
+
+     return {**params, **buffers}
+
+
+ def update_lora(model: torch.nn.Module,
+                 initial_params_buffers) -> dict[str, torchax.torch.Tensor]:
+     params, buffers = _extract_all_params_buffers(model)
+     params_buffers = {**params, **buffers}
+     for k, v in params_buffers.items():
+         if 'lora_a_stacked' in k or 'lora_b_stacked' in k:
+             assert k in initial_params_buffers, f"{k} not in initial_params_buffers"
+             initial_params_buffers[k] = v
+
+     return initial_params_buffers
+
+
+ def _extract_all_params_buffers(model: torch.nn.Module):
+     return dict(model.named_parameters()), dict(model.named_buffers())
+
+
+ def _tensor_is_in_cpu(tensor: torch.tensor) -> bool:
+     # Check if the tensor hasn't been converted to a torchax tensor yet.
+     if not isinstance(tensor, torchax.tensor.Tensor):
+         return True
+     # Check if the torchax tensor is still on CPU.
+     return tensor.jax_device == jax.devices('cpu')[0]
+
+
+ def _convert_to_torchax_and_shard(tensor: torch.Tensor,
+                                   sharding: NamedSharding) -> torch.Tensor:
+     if os.getenv("VLLM_TPU_USING_PATHWAYS", False) and isinstance(
+             tensor, torch.Tensor):
+         np_tensor = tensor.detach().cpu().to(torch.float32).numpy()
+         dtype = TORCH_TO_JAX_DTYPE_MAP.get(tensor.dtype, jnp.float32)
+         return torch_view(jax.device_put(np_tensor, sharding).astype(dtype))
+     else:
+         if isinstance(tensor, torchax.tensor.Tensor):
+             tensor = jax_view(tensor)
+         else:
+             tensor = t2j(tensor)
+         return torch_view(_sharded_device_put(tensor, sharding))
+
+
+ def _shard_tensor_to_tpu_replicated(tensor: torch.Tensor,
+                                     mesh: Mesh) -> torchax.tensor.Tensor:
+     return _convert_to_torchax_and_shard(tensor, NamedSharding(mesh, P()))
+
+
+ def _shard_vocab_parallel_embedding(layer: VocabParallelEmbedding,
+                                     mesh: Mesh) -> None:
+     weight = _convert_to_torchax_and_shard(
+         layer.weight, NamedSharding(mesh, P('model', None)))
+     layer.weight = Parameter(weight, requires_grad=False)
+
+
+ def _shard_lm_head(layer: ParallelLMHead, mesh: Mesh):
+     # TODO(qihqi): currently this is not handling the case of
+     # tie_word_weights=True. If that config is set, then we should not create
+     # new weights but reuse the weight from VocabParallelEmbedding.
+     weight = _convert_to_torchax_and_shard(
+         layer.weight, NamedSharding(mesh, P('model', None)))
+     layer.weight = Parameter(weight, requires_grad=False)
+     if layer.bias is not None:
+         bias = _convert_to_torchax_and_shard(layer.bias,
+                                              NamedSharding(mesh, P('model')))
+         layer.bias = Parameter(bias, requires_grad=False)
+
+
+ def _shard_base_linear_lora_replicated(layer: BaseLinearLayerWithLoRA,
+                                        mesh: Mesh) -> None:
+     # NOTE: lora_a_stacked[i] has shape [max_loras, 1, num_out, num_in]
+     sharded_lora_a_tpu = torch.nn.ParameterList()
+     sharded_lora_b_tpu = torch.nn.ParameterList()
+
+     for i in range(layer.n_slices):
+         sharded_lora_a_tpu.append(
+             _shard_tensor_to_tpu_replicated(layer.lora_a_stacked[i], mesh))
+         sharded_lora_b_tpu.append(
+             _shard_tensor_to_tpu_replicated(layer.lora_b_stacked[i], mesh))
+
+     layer.lora_a_stacked = sharded_lora_a_tpu
+     layer.lora_b_stacked = sharded_lora_b_tpu
+
+
+ def _shard_column_linear_lora(layer: ColumnParallelLinearWithLoRA,
+                               mesh: Mesh) -> None:
+     assert layer.n_slices > 0, "layer.n_slices should be greater than 0"
+     # lora_a_stacked[i] has shape [max_loras, 1, max_lora_rank, in_features]
+     sharded_lora_a_tpu = torch.nn.ParameterList()
+     sharded_lora_b_tpu = torch.nn.ParameterList()
+
+     # lora_b_stacked[i] has shape [max_loras, 1, out_features, max_lora_rank]
+     lora_b_partition_spec = P(None, None, 'model', None)
+     lora_b_sharding = NamedSharding(mesh, lora_b_partition_spec)
+     for i in range(layer.n_slices):
+         sharded_lora_a_tpu.append(
+             _shard_tensor_to_tpu_replicated(layer.lora_a_stacked[i], mesh))
+
+         sharded_lora_b_tpu.append(
+             _convert_to_torchax_and_shard(layer.lora_b_stacked[i],
+                                           lora_b_sharding))
+
+     layer.lora_a_stacked = sharded_lora_a_tpu
+     layer.lora_b_stacked = sharded_lora_b_tpu
+
+
+ def _shard_qkv_linear_lora(layer: ColumnParallelLinearWithLoRA,
+                            mesh: Mesh) -> None:
+     _shard_column_linear_lora(layer, mesh)
+
+
+ def _shard_merged_column_parallel_linear_lora(
+         layer: MergedColumnParallelLinearWithLoRA, mesh: Mesh) -> None:
+     _shard_column_linear_lora(layer, mesh)
+
+
+ def _shard_merged_qkv_parallel_linear_lora(
+         layer: MergedQKVParallelLinearWithLoRA, mesh: Mesh) -> None:
+     _shard_column_linear_lora(layer, mesh)
+
+
+ def _shard_row_parallel_linear_lora(layer: RowParallelLinearWithLoRA,
+                                     mesh: Mesh) -> None:
+     _shard_base_linear_lora_replicated(layer, mesh)
+
+
+ # NOTE: Ordering is important as it calls the first matched type of a given module.
+ MODULE_TYPE_TO_SHARDING_FUNC = [
+     # Shard embedding layers
+     (ParallelLMHead, _shard_lm_head),
+     (VocabParallelEmbedding, _shard_vocab_parallel_embedding),
+     # Shard LoRA layers
+     (ColumnParallelLinearWithLoRA, _shard_column_linear_lora),
+     (QKVParallelLinearWithLoRA, _shard_qkv_linear_lora),
+     (MergedColumnParallelLinearWithLoRA,
+      _shard_merged_column_parallel_linear_lora),
+     (MergedQKVParallelLinearWithLoRA, _shard_merged_qkv_parallel_linear_lora),
+     (RowParallelLinearWithLoRA, _shard_row_parallel_linear_lora),
+     (ReplicatedLinearWithLoRA, _shard_base_linear_lora_replicated),
+ ]
+
+
+ def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
+     for path, module in model.named_modules():
+         for module_type, sharding_func in MODULE_TYPE_TO_SHARDING_FUNC:
+             if type(module) is module_type:
+                 logger.debug("shard %s with %s", path, sharding_func)
+                 sharding_func(module, mesh)
+                 break
+
+
+ def _sharded_device_put(tensor: jax.Array, sharding) -> jax.Array:
+     if isinstance(tensor, tuple):
+         return tuple(_sharded_device_put(t, sharding) for t in tensor)
+     multihost_backend = envs.TPU_MULTIHOST_BACKEND
+     if multihost_backend != "ray":
+         return jax.device_put(tensor, sharding)
+
+     # NOTE: Here, num_global_devices != num_local_devices, meaning we are in a
+     # multi-host setup. Each host runs the same process, and each process only
+     # needs to handle the devices accessible to this host.
+     shape = tensor.shape
+     x_split = [
+         jax.device_put(tensor[i], device) for device, i in
+         sharding.addressable_devices_indices_map(shape).items()
+     ]
+     return jax.make_array_from_single_device_arrays(shape,
+                                                     sharding,
+                                                     x_split,
+                                                     dtype=tensor.dtype)
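
The ray branch of _sharded_device_put above uses a standard JAX multi-host pattern: each process places only the shards for its addressable devices and then stitches them into one logically global array. A minimal single-host sketch of the same pattern in plain JAX (the toy shapes and the single 'model' mesh axis are assumptions for illustration; it runs on CPU as well):

    import jax
    import jax.numpy as jnp
    import numpy as np
    from jax.sharding import Mesh, NamedSharding, PartitionSpec

    # One-axis mesh over whatever devices are available.
    devices = np.array(jax.devices())
    mesh = Mesh(devices, axis_names=("model",))
    sharding = NamedSharding(mesh, PartitionSpec("model"))

    # Size the toy array by the device count so axis 0 shards evenly.
    tensor = jnp.arange(4 * len(devices), dtype=jnp.float32)
    shape = tensor.shape

    # Place each addressable slice on its device, as _sharded_device_put does.
    x_split = [
        jax.device_put(tensor[idx], device) for device, idx in
        sharding.addressable_devices_indices_map(shape).items()
    ]

    # Reassemble the per-device pieces into one global array.
    global_arr = jax.make_array_from_single_device_arrays(shape, sharding, x_split)
    assert bool((np.asarray(global_arr) == np.asarray(tensor)).all())
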
@@ -0,0 +1,10 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ from vllm.logger import _VllmLogger
+ from vllm.logger import init_logger as init_vllm_logger
+
+
+ def init_logger(name: str) -> _VllmLogger:
+     # Prepend the root "vllm" to the module path to use vllm's configured logger.
+     patched_name = "vllm." + name
+     return init_vllm_logger(patched_name)
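
A short usage note on the wrapper above: passing a module's __name__ places the logger under the vllm hierarchy, so tpu_inference modules pick up whatever handlers and levels vLLM has configured. A hedged illustration (the module name is just an example):

    from tpu_inference.logger import init_logger

    # The underlying logger is created as "vllm.tpu_inference.worker.tpu_worker",
    # so it inherits vLLM's logging configuration.
    logger = init_logger("tpu_inference.worker.tpu_worker")
    logger.info("hello from a tpu_inference module")
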
@@ -0,0 +1,13 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.