tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.0rc2.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +312 -0
- tests/layers/vllm/test_unquantized.py +651 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +21 -3
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +78 -1
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +38 -7
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +17 -0
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +28 -5
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +210 -260
- tpu_inference/layers/vllm/linear_common.py +57 -22
- tpu_inference/layers/vllm/quantization/__init__.py +16 -0
- tpu_inference/layers/vllm/quantization/awq.py +15 -1
- tpu_inference/layers/vllm/quantization/common.py +33 -18
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
- tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
- tpu_inference/layers/vllm/sharding.py +21 -4
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +74 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +88 -25
- tpu_inference/models/jax/utils/weight_utils.py +39 -2
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +47 -64
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +72 -37
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +45 -15
- tpu_inference/runner/lora_utils.py +14 -0
- tpu_inference/runner/multimodal_manager.py +15 -1
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +41 -16
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +42 -36
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +63 -50
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
- tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/unquantized.py
CHANGED

@@ -1,4 +1,18 @@
-
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional, Union
 
 import jax
 import jax.numpy as jnp
@@ -25,17 +39,23 @@ from tpu_inference import envs
 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
                                                        get_tpu_quant_method)
-from tpu_inference.layers.
+from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
 from tpu_inference.layers.vllm.quantization.common import (
     JaxCommonConfig, JaxCommonLinearConfig)
+from tpu_inference.utils import get_mesh_shape_product
 
 P = PartitionSpec
 logger = init_logger(__name__)
 
 
+def align_to(a, b):
+    return (a + b - 1) // b * b
+
+
 @register_quantization_config(get_tpu_quant_method(UNQUANTIZED))
 class VllmUnquantizedConfig(QuantizationConfig, JaxCommonConfig):
 
@@ -168,7 +188,7 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
                  ep_axis_name: str = 'model'):
         super().__init__(moe)
         self.mesh = mesh
-        self.use_kernel = envs.USE_MOE_EP_KERNEL
+        self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
         self.ep_axis_name = ep_axis_name
         # TODO: Use autotune table once we have it.
         self.block_size = {
@@ -196,6 +216,8 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
+        num_experts, hidden_size, intermediate_size = w2_weight.shape
+
         if self.moe.has_bias:
             w13_bias = t2j(layer.w13_bias, use_dlpack=False)
             w2_bias = t2j(layer.w2_bias, use_dlpack=False)
@@ -214,7 +236,7 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             w3_bias = w13_bias[:, 1::2]
             w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-        if self.use_kernel
+        if self.use_kernel:
             # Kernel expects:
             #   w13: (num_experts, 2, hidden_size, intermediate_size)
             #   w2: (num_experts, intermediate_size, hidden_size)
@@ -225,87 +247,119 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             intermediate_size = w13_weight.shape[1] // 2
             hidden_size = w13_weight.shape[2]
 
-
-
-                intermediate_size, hidden_size)
-            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
+            padded_intermediate_size = align_to(intermediate_size, 256)
+            padded_hidden_size = align_to(hidden_size, 256)
 
             # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
-
+            w13_weight = w13_weight.reshape(num_experts, 2, intermediate_size,
+                                            hidden_size)
+            w13_weight = jnp.swapaxes(w13_weight, 3, 2)
+
+            w2_weight = jnp.swapaxes(w2_weight, 2, 1)
+
+            w13_weight = jnp.pad(
+                w13_weight,
+                ((0, 0), (0, 0), (0, padded_hidden_size - hidden_size),
+                 (0, padded_intermediate_size - intermediate_size)),
+                constant_values=0)
+
+            w2_weight = jnp.pad(
+                w2_weight,
+                ((0, 0), (0, padded_intermediate_size - intermediate_size),
+                 (0, padded_hidden_size - hidden_size)),
+                constant_values=0)
 
             # Apply EP sharding
+            ep_sharding = NamedSharding(self.mesh, P("model"))
+
             w13_weight = jax.device_put(
-
+                w13_weight,
                 Format(Layout((0, 1, 2, 3)),
                        NamedSharding(self.mesh, P("model", None, None, None))))
             w2_weight = jax.device_put(
-
+                w2_weight,
                 Format(Layout((0, 1, 2)),
                        NamedSharding(self.mesh, P("model", None, None))))
 
             if self.moe.has_bias:
-                w13_bias = w13_bias.reshape(
+                w13_bias = w13_bias.astype(jnp.float32).reshape(
+                    num_experts, 2, 1, intermediate_size)
+                w2_bias = w2_bias.astype(jnp.float32).reshape(
+                    num_experts, 1, hidden_size)
+
+                w13_bias = jnp.pad(
+                    w13_bias,
+                    ((0, 0), (0, 0), (0, 0),
+                     (0, padded_intermediate_size - intermediate_size)),
+                    constant_values=0)
+
+                w2_bias = jnp.pad(w2_bias,
+                                  ((0, 0), (0, 0),
+                                   (0, padded_hidden_size - hidden_size)),
+                                  constant_values=0)
 
                 # Apply EP sharding
                 w13_bias = jax.device_put(
-                    w13_bias,
-                    Format(Layout((0, 1, 2)),
-                           NamedSharding(self.mesh, P("model", None, None))))
+                    w13_bias, Format(Layout((0, 1, 2, 3)), ep_sharding))
                 w2_bias = jax.device_put(
-                    w2_bias,
-                    Format(Layout((0, 1)),
-                           NamedSharding(self.mesh, P("model", None))))
-
+                    w2_bias, Format(Layout((0, 1, 2)), ep_sharding))
         else:
-
+            if self.moe.has_bias:
+                w13_bias = jnp.expand_dims(w13_bias, 1)
+                w2_bias = jnp.expand_dims(w2_bias, 1)
+
             if layer.use_ep:
+                ep_sharding = NamedSharding(self.mesh,
+                                            P(ShardingAxisName.EXPERT))
                 w13_weight = jax.device_put(
-                    w13_weight,
-                    Format(Layout((0, 1, 2)),
-                           NamedSharding(self.mesh, P("model", None, None))))
+                    w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
                 w2_weight = jax.device_put(
-                    w2_weight,
-                    Format(Layout((0, 1, 2)),
-                           NamedSharding(self.mesh, P("model", None, None))))
+                    w2_weight, Format(Layout((0, 1, 2)), ep_sharding))
 
                 if self.moe.has_bias:
                     w13_bias = jax.device_put(
-                        w13_bias,
-                        Format(Layout((0, 1)),
-                               NamedSharding(self.mesh, P("model", None))))
+                        w13_bias, Format(Layout((0, 1, 2)), ep_sharding))
                     w2_bias = jax.device_put(
-                        w2_bias,
-                        Format(Layout((0, 1)),
-                               NamedSharding(self.mesh, P("model", None))))
+                        w2_bias, Format(Layout((0, 1, 2)), ep_sharding))
 
             else:
-                intermediate_size = w13_weight.shape[1] // 2
-                assert intermediate_size == w2_weight.shape[-1]
                 output_sizes = [intermediate_size, intermediate_size]
-                n_shards = self.mesh
+                n_shards = get_mesh_shape_product(self.mesh,
+                                                  ShardingAxisName.MLP_TENSOR)
                 assert intermediate_size % n_shards == 0
+
                 w13_weight = reorder_concatenated_tensor_for_sharding(
                     w13_weight, output_sizes, n_shards, dim=1)
                 w13_weight = jax.device_put(
                     w13_weight,
-                    Format(
-
+                    Format(
+                        Layout((0, 1, 2)),
+                        NamedSharding(
+                            self.mesh,
+                            P(None, ShardingAxisName.MLP_TENSOR, None))))
                 w2_weight = jax.device_put(
                     w2_weight,
-                    Format(
-
+                    Format(
+                        Layout((0, 1, 2)),
+                        NamedSharding(
+                            self.mesh,
+                            P(None, None, ShardingAxisName.MLP_TENSOR))))
 
                 if self.moe.has_bias:
                     w13_bias = reorder_concatenated_tensor_for_sharding(
-                        w13_bias, output_sizes, n_shards, dim=
+                        w13_bias, output_sizes, n_shards, dim=2)
+
                     w13_bias = jax.device_put(
                         w13_bias,
-                        Format(
-
+                        Format(
+                            Layout((0, 1, 2)),
+                            NamedSharding(
+                                self.mesh,
+                                P(None, None, ShardingAxisName.MLP_TENSOR))))
                     w2_bias = jax.device_put(
                         w2_bias,
-                        Format(Layout((0, 1)),
-                               NamedSharding(self.mesh, P(None, None))))
+                        Format(Layout((0, 1, 2)),
+                               NamedSharding(self.mesh, P(None, None, None))))
 
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
@@ -321,60 +375,54 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
-       top_k: int,
-       renormalize: bool,
-       use_grouped_topk: bool = False,
-       topk_group: Optional[int] = None,
-       num_expert_group: Optional[int] = None,
-       global_num_experts: int = -1,
-       expert_map: Optional[torch.Tensor] = None,
-       custom_routing_function: Optional[Callable] = None,
-       scoring_func: str = "softmax",
-       routed_scaling_factor: float = 1.0,
-       e_score_correction_bias: Optional[torch.Tensor] = None,
-       apply_router_weight_on_input: bool = False,
-       activation: str = "silu",
-       enable_eplb: bool = False,
-       expert_load_view: Optional[torch.Tensor] = None,
-       logical_to_physical_map: Optional[torch.Tensor] = None,
-       logical_replica_count: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        assert isinstance(layer, FusedMoE)
-       if scoring_func != "softmax":
+       if layer.scoring_func != "softmax":
            raise NotImplementedError(
                "Only softmax is supported for scoring_func")
 
-
+       x = jax_view(x)
+       w13_weight = jax_view(layer.w13_weight)
+       w2_weight = jax_view(layer.w2_weight)
+       w13_bias = w2_bias = None
+       if self.moe.has_bias:
+           w13_bias = jax_view(layer.w13_bias)
+           w2_bias = jax_view(layer.w2_bias)
+       gating_output = jax_view(router_logits)
+
+       if self.use_kernel:
+           actual_hidden_size = x.shape[-1]
+           padding_size = w13_weight.shape[-2] - actual_hidden_size
+           x = jnp.pad(x, ((0, 0), (0, padding_size)))
            output = fused_ep_moe(
                mesh=self.mesh,
-               tokens=
-               w1=
-               w2=
-               b1=
-               b2=
-               gating_output=
-               top_k=top_k,
+               tokens=x,
+               w1=w13_weight,
+               w2=w2_weight,
+               b1=w13_bias,
+               b2=w2_bias,
+               gating_output=gating_output,
+               top_k=layer.top_k,
                ep_axis_name=self.ep_axis_name,
-               renormalize_topk_logits=renormalize,
-               act_fn=activation,
+               renormalize_topk_logits=layer.renormalize,
+               act_fn=layer.activation,
                **self.block_size,
-           )
+           )[:, :actual_hidden_size]
        else:
-
-
-
-
-
-
-
-
-
-
-               renormalize=renormalize,
-               reduce_results=layer.reduce_results,
+           output = fused_moe_func(
+               hidden_states=x,
+               w1=w13_weight,
+               w2=w2_weight,
+               w1_scale=None,
+               w2_scale=None,
+               w1_bias=w13_bias,
+               w2_bias=w2_bias,
+               gating_output=gating_output,
+               topk=layer.top_k,
+               renormalize=layer.renormalize,
               mesh=self.mesh,
               use_ep=layer.use_ep,
-              activation=activation,
+              activation=layer.activation,
           )
 
       return torch_view(output)
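The align_to helper added above exists so the fused EP MoE kernel path sees weight dimensions that are multiples of its 256-element block, with jnp.pad filling the difference with zeros. A minimal sketch of the arithmetic (the size below is illustrative, not taken from any real model config):

def align_to(a, b):
    # Smallest multiple of b that is >= a: ceiling-divide, then scale back up.
    return (a + b - 1) // b * b

intermediate_size = 1408                     # illustrative value only
padded = align_to(intermediate_size, 256)    # -> 1536
pad_amount = padded - intermediate_size      # -> 128 zero columns appended by jnp.pad
print(padded, pad_amount)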
tpu_inference/layers/vllm/sharding.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 
 import jax
@@ -20,6 +34,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
 from tpu_inference import envs
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -109,7 +124,8 @@ def _shard_tensor_to_tpu_replicated(tensor: torch.Tensor,
 def _shard_vocab_parallel_embedding(layer: VocabParallelEmbedding,
                                     mesh: Mesh) -> None:
     weight = _convert_to_torchax_and_shard(
-        layer.weight, NamedSharding(mesh, P(
+        layer.weight, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR,
+                                            None)))
     layer.weight = Parameter(weight, requires_grad=False)
 
 
@@ -118,11 +134,12 @@ def _shard_lm_head(layer: ParallelLMHead, mesh: Mesh):
     # if that config is set, then we should not create new weights but reuse the
     # weight from VocabParallelEmbedding
     weight = _convert_to_torchax_and_shard(
-        layer.weight, NamedSharding(mesh, P(
+        layer.weight, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR,
+                                            None)))
     layer.weight = Parameter(weight, requires_grad=False)
     if layer.bias is not None:
-        bias = _convert_to_torchax_and_shard(
-
+        bias = _convert_to_torchax_and_shard(
+            layer.bias, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR)))
         layer.bias = Parameter(bias, requires_grad=False)
 
 
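The sharding.py change swaps hard-coded partition specs for the ShardingAxisName constants when building the NamedSharding applied to the embedding and LM-head weights. A minimal, self-contained sketch of what such a spec does (single-device mesh and a made-up "model" axis name standing in for ShardingAxisName.MLP_TENSOR; this is not the package's actual code):

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

MLP_TENSOR = "model"  # stand-in for ShardingAxisName.MLP_TENSOR

mesh = Mesh(np.array(jax.devices()[:1]), axis_names=(MLP_TENSOR,))
weight = jnp.zeros((1024, 512))  # (vocab_size, hidden_size), illustrative shape
# Shard the vocab dimension across the "model" axis, replicate the hidden dimension.
sharded = jax.device_put(weight, NamedSharding(mesh, P(MLP_TENSOR, None)))
print(sharded.sharding)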
tpu_inference/lora/__init__.py
CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
tpu_inference/lora/torch_lora_ops.py
CHANGED

@@ -4,7 +4,6 @@
 import jax
 import jax.numpy as jnp
 import torch
-import torch.nn.functional as F
 from torchax.interop import call_jax
 
 
@@ -85,19 +84,15 @@ def bgmv_expand_slice(
         add_inputs (bool): Whether or not to add the input tensor to the output
             tensor.
     """
-    outputs = bgmv_torch(inputs, lora_b_weights,
+    outputs = bgmv_torch(inputs, lora_b_weights,
+                         lora_indices_tensor)  # [num_tokens, out_features]
 
-
-
-
-
-            output_tensor.shape[1] - (slice_offset + slice_size),
-            0,
-            0,
-        ),
-    )
+    # Create a padded tensor manually to avoid issues with F.pad on sharded tensors.
+    # This is a more robust way to handle padding in a distributed environment.
+    outputs_padded = torch.zeros_like(output_tensor)
+    outputs_padded[:, slice_offset:slice_offset + slice_size] = outputs
 
     if add_inputs:
-        return output_tensor +
+        return output_tensor + outputs_padded
     else:
-        return
+        return outputs_padded
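The bgmv_expand_slice rewrite drops F.pad in favour of writing the LoRA slice into a zero tensor of the full output width, which produces the same result but sidesteps the sharded-tensor issue its new comment describes. A small stand-alone sketch of the pattern (shapes and values are illustrative only):

import torch

num_tokens, out_features = 4, 16
slice_offset, slice_size = 4, 8

output_tensor = torch.randn(num_tokens, out_features)
outputs = torch.ones(num_tokens, slice_size)  # stands in for the bgmv_torch(...) result

# Same effect as F.pad(outputs, (slice_offset, out_features - slice_offset - slice_size)),
# but built explicitly so it also behaves when output_tensor is sharded.
outputs_padded = torch.zeros_like(output_tensor)
outputs_padded[:, slice_offset:slice_offset + slice_size] = outputs

add_inputs = True
result = output_tensor + outputs_padded if add_inputs else outputs_padded
print(result.shape)  # torch.Size([4, 16])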
tpu_inference/models/__init__.py
CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
(another file, name not captured in this diff)
CHANGED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.