tpu-inference 0.12.0.dev20251213__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +14 -0
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +14 -0
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +180 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +406 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +320 -0
- tests/layers/vllm/test_unquantized.py +662 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +25 -8
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +14 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +20 -3
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +171 -163
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +20 -26
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +112 -69
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +85 -65
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +374 -194
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +13 -0
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/fused_moe_gmm.py +506 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +282 -0
- tpu_inference/layers/common/sharding.py +22 -3
- tpu_inference/layers/common/utils.py +94 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +52 -27
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +19 -6
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +100 -455
- tpu_inference/layers/vllm/linear.py +64 -0
- tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
- tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
- tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
- tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
- tpu_inference/layers/vllm/quantization/__init__.py +19 -3
- tpu_inference/layers/vllm/quantization/awq.py +96 -82
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +19 -5
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +119 -132
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
- tpu_inference/layers/vllm/quantization/{common.py → configs.py} +38 -26
- tpu_inference/layers/vllm/quantization/fp8.py +119 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +133 -220
- tpu_inference/layers/vllm/quantization/unquantized.py +154 -253
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +37 -16
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +113 -124
- tpu_inference/models/jax/gpt_oss.py +23 -7
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +85 -24
- tpu_inference/models/jax/utils/weight_utils.py +32 -1
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +22 -4
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +27 -29
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +69 -35
- tpu_inference/runner/kv_cache.py +14 -0
- tpu_inference/runner/kv_cache_manager.py +15 -2
- tpu_inference/runner/lora_utils.py +16 -1
- tpu_inference/runner/multimodal_manager.py +16 -2
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +30 -10
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +31 -30
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +23 -7
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +1 -1
- tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
- tpu_inference/layers/vllm/linear_common.py +0 -208
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.12.0.dev20251213.dist-info/RECORD +0 -175
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/unquantized.py CHANGED

@@ -1,19 +1,29 @@
-
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
 
 import jax
 import jax.numpy as jnp
 import torch
-from jax.experimental.layout import Format, Layout
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from torch.nn.parameter import Parameter
 from torchax.interop import jax_view, torch_view
 from torchax.ops.mappings import t2j
 from vllm.attention.layer import Attention
-from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEConfig, UnquantizedFusedMoEMethod)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import \

@@ -21,27 +31,31 @@ from vllm.model_executor.layers.quantization import \
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 
-from tpu_inference import envs
-from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
                                                        get_tpu_quant_method)
-from tpu_inference.layers.
-from tpu_inference.layers.
-
-
-
-
+from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.common.utils import \
+    slice_sharded_tensor_for_concatenation
+from tpu_inference.layers.vllm.fused_moe import (FusedMoEBackend,
+                                                 fused_moe_apply,
+                                                 select_moe_backend)
+from tpu_inference.layers.vllm.process_weights.fused_moe_weights import (
+    FusedMoEWeights, process_moe_weights, shard_moe_weights)
+from tpu_inference.layers.vllm.process_weights.linear_weights import (
+    LinearWeights, process_lienar_weights, shard_linear_weights,
+    to_parameter_list)
+from tpu_inference.layers.vllm.quantization.configs import (
+    VllmQuantConfig, VllmQuantLinearConfig)
+from tpu_inference.logger import init_logger
+from tpu_inference.utils import get_mesh_shape_product
 
 P = PartitionSpec
-logger = init_logger(__name__)
-
 
-
-    return (a + b - 1) // b * b
+logger = init_logger(__name__)
 
 
 @register_quantization_config(get_tpu_quant_method(UNQUANTIZED))
-class VllmUnquantizedConfig(QuantizationConfig,
+class VllmUnquantizedConfig(QuantizationConfig, VllmQuantConfig):
 
     @classmethod
     def get_name(cls) -> str:

@@ -78,35 +92,54 @@ class VllmUnquantizedConfig(QuantizationConfig, JaxCommonConfig):
 
 class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
 
-    def __init__(self,
-        self.
+    def __init__(self, linear_config: VllmQuantLinearConfig):
+        self.linear_config = linear_config
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        weight =
-            layer.weight,
-            NamedSharding(self.jax_config.mesh,
-                          self.jax_config.weight_sharding),
-            self.jax_config.output_sizes,
-            self.jax_config.n_shards,
-            self.jax_config.fuse_matmuls,
-        )
+        weight = t2j(layer.weight, use_dlpack=False)
         delattr(layer, "weight")
-        layer.weight = weight
-
         if layer.bias is not None and not layer.skip_bias_add:
             if layer.return_bias:
                 logger.warning_once("Bias might return incorrect value.")
-
-            bias = torch_to_jax_param(
-                layer.bias,
-                NamedSharding(self.jax_config.mesh,
-                              self.jax_config.bias_sharding),
-                self.jax_config.output_sizes,
-                self.jax_config.n_shards,
-                self.jax_config.fuse_matmuls,
-            )
+            bias = t2j(layer.bias, use_dlpack=False)
             delattr(layer, "bias")
-
+        else:
+            bias = None
+
+        @jax.jit
+        def process_unquantized_linear_weights(
+            weight: jax.Array,
+            bias: jax.Array | None,
+        ) -> LinearWeights:
+            return process_lienar_weights(
+                LinearWeights(
+                    weight=weight,
+                    weight_scale=None,
+                    zero_point=None,
+                    bias=bias,
+                ),
+                fused=self.linear_config.fuse_matmuls,
+                output_sizes=self.linear_config.output_sizes,
+                reorder_size=self.linear_config.n_shards,
+            )
+
+        weights = process_unquantized_linear_weights(weight, bias)
+        weights = torch_view(
+            shard_linear_weights(
+                weights,
+                mesh=self.linear_config.mesh,
+                weight_p_spec=self.linear_config.weight_sharding,
+                bias_p_spec=self.linear_config.bias_sharding,
+            ))
+
+        if self.linear_config.fuse_matmuls:
+            layer.weight = Parameter(weights.weight, requires_grad=False)
+            if bias is not None:
+                layer.bias = Parameter(weights.bias, requires_grad=False)
+        else:
+            layer.weight = to_parameter_list(weights.weight)
+            if bias is not None:
+                layer.bias = to_parameter_list(weights.bias)
 
     def apply(self,
               layer: torch.nn.Module,

@@ -115,16 +148,17 @@ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
         assert isinstance(layer, LinearBase)
 
         with jax.named_scope(layer._get_name()):
-            if in_sharding := self.
-                x.shard_(NamedSharding(self.
+            if in_sharding := self.linear_config.get_input_sharding(x):
+                x.shard_(NamedSharding(self.linear_config.mesh, in_sharding))
 
-            if self.
+            if self.linear_config.fuse_matmuls:
                 out = self._apply_fused(layer, x, bias)
             else:
                 out = self._apply_split(layer, x, bias)
 
-            if out_sharding := self.
-                out.shard_(NamedSharding(self.
+            if out_sharding := self.linear_config.get_output_sharding(out):
+                out.shard_(NamedSharding(self.linear_config.mesh,
+                                         out_sharding))
 
         return out
 

@@ -140,7 +174,7 @@ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
             outs += bias.jax()
 
         outs = slice_sharded_tensor_for_concatenation(
-            outs, self.
+            outs, self.linear_config.output_sizes, self.linear_config.n_shards)
         out = jnp.concatenate(outs, axis=-1)
         return torch_view(out)
 
@@ -166,232 +200,99 @@ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
 
 class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
-    def __init__(
-        moe: FusedMoEConfig,
-        mesh: Mesh,
-        ep_axis_name: str = 'model'):
-        super().__init__(moe)
-        self.mesh = mesh
-        self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
-        self.ep_axis_name = ep_axis_name
-        # TODO: Use autotune table once we have it.
-        self.block_size = {
-            "bt": 64,
-            "bf": 1024,
-            "bd1": 1536,
-            "bd2": 1536,
-            "btc": 64,
-            "bfc": 1024,
-            "bd1c": 1536,
-            "bd2c": 1536,
-        }
-
-    def select_gemm_impl(
+    def __init__(
         self,
-        prepare_finalize: FusedMoEPrepareAndFinalize,
         moe: FusedMoEConfig,
-
-
-
-
+        mesh: Mesh,
+        ep_axis_name: str = "model",
+    ):
+        super().__init__(moe)
+        self.mesh = mesh
+        self.moe_backend = select_moe_backend(self.moe)
+
+        self.extra_backend_kwargs = {}
+        if self.moe_backend == FusedMoEBackend.FUSED_MOE:
+            # When fused moe kernle is used, we pass extra arguments like
+            # tuned block sizes to the kernel.
+            self.extra_backend_kwargs = dict(
+                ep_axis_name=ep_axis_name,
+                # TODO: Use autotune table once we have it.
+                bt=64,
+                bf=1024,
+                bd1=1536,
+                bd2=1536,
+                btc=64,
+                bfc=1024,
+                bd1c=1536,
+                bd2c=1536,
+            )
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)
+
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
-        num_experts, hidden_size, intermediate_size = w2_weight.shape
-
         if self.moe.has_bias:
             w13_bias = t2j(layer.w13_bias, use_dlpack=False)
             w2_bias = t2j(layer.w2_bias, use_dlpack=False)
-
-        if layer.activation == "swigluoai":
-            # When using swigluoai, vLLM splits gmm output in a interleaved way.
-            # However, interleaved split is not performant on TPU. Therefore,
-            # we preprocess the weight so that splitting gmm output by middle
-            # can still get the same result.
-            w1_weight = w13_weight[:, ::2, :]
-            w3_weight = w13_weight[:, 1::2, :]
-            w13_weight = jnp.concat([w1_weight, w3_weight], axis=1)
-
-            if self.moe.has_bias:
-                w1_bias = w13_bias[:, ::2]
-                w3_bias = w13_bias[:, 1::2]
-                w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
-
-        if self.use_kernel:
-            # Kernel expects:
-            # w13: (num_experts, 2, hidden_size, intermediate_size)
-            # w2: (num_experts, intermediate_size, hidden_size)
-            # Current format:
-            # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
-            # w2_weight: (num_experts, hidden_size, intermediate_size)
-            num_experts = w13_weight.shape[0]
-            intermediate_size = w13_weight.shape[1] // 2
-            hidden_size = w13_weight.shape[2]
-
-            padded_intermediate_size = align_to(intermediate_size, 256)
-            padded_hidden_size = align_to(hidden_size, 256)
-
-            w13_weight = w13_weight.reshape(num_experts, 2, intermediate_size,
-                                            hidden_size)
-            w13_weight = jnp.transpose(w13_weight, (0, 1, 3, 2))
-
-            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
-            w2_weight = jnp.transpose(w2_weight, (0, 2, 1))
-
-            w13_weight = jnp.pad(
-                w13_weight,
-                ((0, 0), (0, 0), (0, padded_hidden_size - hidden_size),
-                 (0, padded_intermediate_size - intermediate_size)),
-                constant_values=0)
-
-            w2_weight = jnp.pad(
-                w2_weight,
-                ((0, 0), (0, padded_intermediate_size - intermediate_size),
-                 (0, padded_hidden_size - hidden_size)),
-                constant_values=0)
-
-            # Apply EP sharding
-            ep_sharding = NamedSharding(self.mesh, P("model"))
-
-            w13_weight = jax.device_put(
-                w13_weight,
-                Format(Layout((0, 1, 2, 3)),
-                       NamedSharding(self.mesh, P("model", None, None, None))))
-            w2_weight = jax.device_put(
-                w2_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P("model", None, None))))
-
-            if self.moe.has_bias:
-                w13_bias = w13_bias.astype(jnp.float32).reshape(
-                    num_experts, 2, 1, intermediate_size)
-                w2_bias = w2_bias.astype(jnp.float32).reshape(
-                    num_experts, 1, hidden_size)
-
-                w13_bias = jnp.pad(
-                    w13_bias,
-                    ((0, 0), (0, 0), (0, 0),
-                     (0, padded_intermediate_size - intermediate_size)),
-                    constant_values=0)
-
-                w2_bias = jnp.pad(w2_bias,
-                                  ((0, 0), (0, 0),
-                                   (0, padded_hidden_size - hidden_size)),
-                                  constant_values=0)
-
-                # Apply EP sharding
-                w13_bias = jax.device_put(
-                    w13_bias, Format(Layout((0, 1, 2, 3)), ep_sharding))
-                w2_bias = jax.device_put(
-                    w2_bias, Format(Layout((0, 1, 2)), ep_sharding))
         else:
+            w13_bias = w2_bias = None
+
+        @jax.jit
+        def process_unquantized_moe_weights(
+            w13_weight: jax.Array,
+            w13_bias: jax.Array | None,
+            w2_weight: jax.Array,
+            w2_bias: jax.Array | None,
+        ) -> FusedMoEWeights:
+
+            w13_interleave = layer.activation == "swigluoai"
+            w13_reorder_size = get_mesh_shape_product(
+                self.mesh, ShardingAxisName.MLP_TENSOR)
+
+            return process_moe_weights(
+                FusedMoEWeights(
+                    w13_weight=w13_weight,
+                    w13_weight_scale=None,
+                    w13_bias=w13_bias,
+                    w2_weight=w2_weight,
+                    w2_weight_scale=None,
+                    w2_bias=w2_bias,
+                ),
+                moe_backend=self.moe_backend,
+                w13_reorder_size=w13_reorder_size,
+                w13_interleave=w13_interleave,
+            )
 
-
-
-
-
-
-
+        weights = process_unquantized_moe_weights(
+            w13_weight,
+            w13_bias,
+            w2_weight,
+            w2_bias,
+        )
+        weights = torch_view(
+            shard_moe_weights(weights, self.moe_backend, self.mesh))
 
-
-
-                    w13_bias, Format(Layout((0, 1)), ep_sharding))
-                w2_bias = jax.device_put(
-                    w2_bias, Format(Layout((0, 1)), ep_sharding))
-
-        else:
-            output_sizes = [intermediate_size, intermediate_size]
-            n_shards = self.mesh.shape["model"]
-            assert intermediate_size % n_shards == 0
-
-            w13_weight = reorder_concatenated_tensor_for_sharding(
-                w13_weight, output_sizes, n_shards, dim=1)
-            w13_weight = jax.device_put(
-                w13_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P(None, "model", None))))
-            w2_weight = jax.device_put(
-                w2_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P(None, None, "model"))))
-
-            if self.moe.has_bias:
-                w13_bias = reorder_concatenated_tensor_for_sharding(
-                    w13_bias, output_sizes, n_shards, dim=1)
-                w13_bias = jax.device_put(
-                    w13_bias,
-                    Format(Layout((0, 1)),
-                           NamedSharding(self.mesh, P(None, "model"))))
-                w2_bias = jax.device_put(
-                    w2_bias,
-                    Format(Layout((0, 1)),
-                           NamedSharding(self.mesh, P(None, None))))
-
-        layer.w13_weight = Parameter(torch_view(w13_weight),
-                                     requires_grad=False)
-        layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
+        layer.w13_weight = Parameter(weights.w13_weight, requires_grad=False)
+        layer.w2_weight = Parameter(weights.w2_weight, requires_grad=False)
 
         if self.moe.has_bias:
-            layer.w13_bias = Parameter(
-
-            layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
+            layer.w13_bias = Parameter(weights.w13_bias, requires_grad=False)
+            layer.w2_bias = Parameter(weights.w2_bias, requires_grad=False)
 
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-    ) ->
-
-
-
-
-
-
-
-
-
-        if self.moe.has_bias:
-            w13_bias = jax_view(layer.w13_bias)
-            w2_bias = jax_view(layer.w2_bias)
-        gating_output = jax_view(router_logits)
-
-        if self.use_kernel:
-            actual_hidden_size = x.shape[-1]
-            padded_hidden_size = align_to(actual_hidden_size, 256)
-            x = jnp.pad(x,
-                        ((0, 0), (0, padded_hidden_size - actual_hidden_size)),
-                        constant_values=0)
-            output = fused_ep_moe(
-                mesh=self.mesh,
-                tokens=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                b1=w13_bias,
-                b2=w2_bias,
-                gating_output=gating_output,
-                top_k=layer.top_k,
-                ep_axis_name=self.ep_axis_name,
-                renormalize_topk_logits=layer.renormalize,
-                act_fn=layer.activation,
-                **self.block_size,
-            )[:, :actual_hidden_size]
-        else:
-            output = fused_moe_func(
-                hidden_states=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                w1_bias=w13_bias,
-                w2_bias=w2_bias,
-                gating_output=gating_output,
-                topk=layer.top_k,
-                renormalize=layer.renormalize,
-                mesh=self.mesh,
-                use_ep=layer.use_ep,
-                activation=layer.activation,
-            )
-
-        return torch_view(output)
+    ) -> torch.Tensor:
+
+        return fused_moe_apply(
+            layer,
+            x,
+            router_logits,
+            self.moe_backend,
+            self.mesh,
+            self.extra_backend_kwargs,
+        )
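The removed swigluoai branch above de-interleaves the fused gate/up projection so the gmm output can be split down the middle instead of by stride; in the new code this appears to be delegated to process_moe_weights via the w13_interleave flag. A minimal standalone sketch of that reordering idea in plain JAX, with illustrative shapes and names that are not taken from the package:

import jax.numpy as jnp

# Illustrative sizes: E experts, intermediate size I (so 2*I fused rows), hidden size H.
E, I, H = 2, 4, 8
w13 = jnp.arange(E * 2 * I * H, dtype=jnp.float32).reshape(E, 2 * I, H)

# Interleaved layout: even rows feed the gate (w1) matmul, odd rows the up (w3) matmul.
w1_interleaved = w13[:, ::2, :]
w3_interleaved = w13[:, 1::2, :]

# Reorder once at weight-processing time so a cheap contiguous middle split
# recovers the same two halves at runtime.
w13_reordered = jnp.concatenate([w1_interleaved, w3_interleaved], axis=1)
w1_half, w3_half = jnp.split(w13_reordered, 2, axis=1)

assert jnp.array_equal(w1_half, w1_interleaved)
assert jnp.array_equal(w3_half, w3_interleaved)

The one-time reorder trades a single host-side shuffle of the weights for a strided gather on every forward pass, which is the performance concern the removed comment describes.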
tpu_inference/lora/__init__.py CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/lora/torch_lora_ops.py CHANGED

@@ -4,7 +4,6 @@
 import jax
 import jax.numpy as jnp
 import torch
-import torch.nn.functional as F
 from torchax.interop import call_jax
 
 

@@ -85,19 +84,15 @@ def bgmv_expand_slice(
         add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
-    outputs = bgmv_torch(inputs, lora_b_weights,
+    outputs = bgmv_torch(inputs, lora_b_weights,
+                         lora_indices_tensor)  # [num_tokens, out_features]
 
-
-
-
-
-            output_tensor.shape[1] - (slice_offset + slice_size),
-            0,
-            0,
-        ),
-    )
+    # Create a padded tensor manually to avoid issues with F.pad on sharded tensors.
+    # This is a more robust way to handle padding in a distributed environment.
+    outputs_padded = torch.zeros_like(output_tensor)
+    outputs_padded[:, slice_offset:slice_offset + slice_size] = outputs
 
     if add_inputs:
-        return output_tensor +
+        return output_tensor + outputs_padded
     else:
-        return
+        return outputs_padded
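The bgmv_expand_slice hunk above swaps F.pad for an explicit zero tensor plus a slice assignment, which per the added comments avoids calling F.pad on sharded tensors. A small sketch with illustrative sizes (not from the package) showing the two forms agree on an ordinary tensor:

import torch
import torch.nn.functional as F

num_tokens, out_features, slice_offset, slice_size = 3, 16, 4, 8
output_tensor = torch.randn(num_tokens, out_features)
outputs = torch.randn(num_tokens, slice_size)

# Manual padding: allocate zeros of the full output shape and write the slice in place.
outputs_padded = torch.zeros_like(output_tensor)
outputs_padded[:, slice_offset:slice_offset + slice_size] = outputs

# Same result via F.pad on the last dimension (left pad, right pad).
expected = F.pad(outputs,
                 (slice_offset, out_features - (slice_offset + slice_size)))
assert torch.equal(outputs_padded, expected)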
tpu_inference/models/__init__.py CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.