tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.0rc2.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +312 -0
- tests/layers/vllm/test_unquantized.py +651 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +21 -3
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +78 -1
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +38 -7
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +17 -0
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +28 -5
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +210 -260
- tpu_inference/layers/vllm/linear_common.py +57 -22
- tpu_inference/layers/vllm/quantization/__init__.py +16 -0
- tpu_inference/layers/vllm/quantization/awq.py +15 -1
- tpu_inference/layers/vllm/quantization/common.py +33 -18
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
- tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
- tpu_inference/layers/vllm/sharding.py +21 -4
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +74 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +88 -25
- tpu_inference/models/jax/utils/weight_utils.py +39 -2
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +47 -64
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +72 -37
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +45 -15
- tpu_inference/runner/lora_utils.py +14 -0
- tpu_inference/runner/multimodal_manager.py +15 -1
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +41 -16
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +42 -36
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +63 -50
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
- tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
@@ -1,203 +1,266 @@
-
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
 
 import jax
 import jax.numpy as jnp
 import torch
-
+from compressed_tensors.quantization import QuantizationArgs
 from jax.experimental.layout import Format, Layout
 from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
 from torch.nn.parameter import Parameter
-from torchax.interop import
+from torchax.interop import jax_view, torch_view
 from torchax.ops.mappings import t2j
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEConfig
-from vllm.model_executor.layers.quantization.compressed_tensors.
-
-from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import \
-    CompressedTensorsW8A8Fp8MoEMethod
-from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
-    WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
+    CompressedTensorsMoEMethod, CompressedTensorsW8A8Fp8MoEMethod)
 
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func
+from tpu_inference.layers.vllm.linear_common import \
+    reorder_concatenated_tensor_for_sharding
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedFusedMoEMethod
 
 logger = init_logger(__name__)
 
 
+class VllmCompressedTensorsMoEMethod(CompressedTensorsMoEMethod):
+
+    @staticmethod
+    def get_moe_method(
+        quant_config: "VllmCompressedTensorsConfig",  # type: ignore # noqa E501
+        layer: torch.nn.Module,
+        layer_name: str,
+    ) -> CompressedTensorsMoEMethod:
+        assert isinstance(layer, FusedMoE)
+
+        # FusedMoE was made by combining multiple Linears so need to
+        # make sure quantization config for Linear can target it
+        quant_config._add_fused_moe_to_target_scheme_map()
+        unfused_names = [
+            layer_name + proj_name
+            for proj_name in [".0.gate_proj", ".0.up_proj", ".0.down_proj"]
+        ]
+        # TODO: refactor this to use expert_mapping and check all layer numbers
+        all_scheme_dicts = [
+            quant_config.get_scheme_dict(layer, name) for name in unfused_names
+        ]
+        scheme_dict = all_scheme_dicts.pop()
+
+        # multiple schemes found
+        if not all([cur_dict == scheme_dict for cur_dict in all_scheme_dicts]):
+            raise ValueError("All MoE projections need to have same "
+                             "quantization scheme but found multiple")
+
+        if scheme_dict is None:
+            return VllmUnquantizedFusedMoEMethod(layer.moe_config,
+                                                 quant_config.mesh)
+
+        weight_quant = scheme_dict.get("weights")
+        input_quant = scheme_dict.get("input_activations")
+
+        if quant_config._is_fp8_w8a8(weight_quant, input_quant):
+            return VllmCompressedTensorsW8A8Fp8MoEMethod(
+                weight_quant, input_quant, layer.moe_config, quant_config.mesh)
+        else:
+            raise RuntimeError(
+                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}")
+
+
 class VllmCompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsW8A8Fp8MoEMethod,
                                             JaxCommonConfig):
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
+        moe: FusedMoEConfig,
+        mesh: Mesh,
+    ):
+        super().__init__(weight_quant, input_quant, moe)
         self.mesh = mesh
-        self.quant_config = quant_config
-
-        # disable GPU paths
-        self.use_marlin = False
-        self.rocm_aiter_moe_enabled = False  # is_rocm_aiter_moe_enabled()
-        self.is_fp8_w8a8_sm100 = False
-        self.use_cutlass = False
-        self.disable_expert_map = False
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """
+        Docstring for process_weights_after_loading
+
+        :param self: Description
+        :param layer: Description
+        :type layer: torch.nn.Module
+
+        Steps:
+        1. Read weights from layer object and convert to jax arrays
+        2. Interleave concat w13 weights
+        3. Shard weights for tp (rowwise w13, colwise w2)
+        4. Initialize Params as torch.nn.Parameter
+            a. w13_weight - float8_e4m3fn shape: (num_experts, 2 x intermediate_size, input_size)
+            b. w2_weight - float8_e4m3fn shape: (num_experts, output_size, intermediate_size)
+            c. w13_weight_scale - FP32 shape: (num_experts, 2 x intermediate_size, 1)
+            d. w2_weight_scale - FP32shape: (num_experts, output_size, 1)
+        """
         assert isinstance(layer, FusedMoE)
 
+        # Read weights from layer object
+        w13_weight = t2j(
+            layer.w13_weight, use_dlpack=False
+        )  # float8_e4m3fn shape: (num_experts, 2 x intermediate_size, input_size)
+        w13_weight_scale = t2j(
+            layer.w13_weight_scale, use_dlpack=False
+        )  # FP32 shape: (num_experts, 2 x intermediate_size, 1)
+        w2_weight = t2j(
+            layer.w2_weight, use_dlpack=False
+        )  # float8_e4m3fn shape: (num_experts, output_size, intermediate_size)
+        w2_weight_scale = t2j(layer.w2_weight_scale, use_dlpack=False)
+        w13_weight_scale = w13_weight_scale.astype(jnp.bfloat16)
+        w2_weight_scale = w2_weight_scale.astype(jnp.bfloat16)
         intermediate_size = layer.w13_weight.shape[1] // 2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        assert intermediate_size == w2_weight.shape[-1]
+        n_shards = self.mesh.shape["model"]
+        assert intermediate_size % n_shards == 0
+        num_experts, hidden_size, intermediate_size = w2_weight.shape
+        assert w2_weight_scale.shape == (num_experts, hidden_size, 1)
+        assert w13_weight.shape == (num_experts, 2 * intermediate_size,
+                                    hidden_size)
+        assert w13_weight_scale.shape == (num_experts, 2 * intermediate_size,
+                                          1)
+
+        if not layer.use_ep:
+            # Interleave concat w13 weights
+            w13_weight = reorder_concatenated_tensor_for_sharding(
+                w13_weight,
+                split_sizes=(intermediate_size, intermediate_size),
+                dim=1,
+                n_shards=n_shards,
+            )
+            # Interleave concat w13 weight scales
+            w13_weight_scale = reorder_concatenated_tensor_for_sharding(
+                w13_weight_scale,
+                split_sizes=(intermediate_size, intermediate_size),
+                dim=1,
+                n_shards=n_shards,
+            )
+
+        # 160,5120,1 -> 160,1,5120
+        w13_weight_scale = jnp.swapaxes(w13_weight_scale, 1, 2)
+        # 160,1,5120 -> 160, 1, 1, 5120 (num_experts, num_blocks, 1, outer_dim)
+        w13_weight_scale = jnp.expand_dims(w13_weight_scale, 2)
+        w2_weight_scale = jnp.swapaxes(w2_weight_scale, 1, 2)
+        w2_weight_scale = jnp.expand_dims(w2_weight_scale, 2)
 
         if layer.use_ep:
-
-
-
-
-
-
-
-            w2_weight_scale = jax.device_put(w2_weight_scale, format)
-        else:
-            assert intermediate_size == w2_weight.shape[-1]
-            n_shards = self.mesh.shape["model"]
-            assert intermediate_size % n_shards == 0
+            # Apply EP sharding
+            ep_sharding = NamedSharding(self.mesh, P("model"))
+
+            w13_weight = jax.lax.with_sharding_constraint(
+                w13_weight, ep_sharding)
+            w2_weight = jax.lax.with_sharding_constraint(
+                w2_weight, ep_sharding)
 
-
-
-
-
-            # )
+            w13_weight_scale = jax.lax.with_sharding_constraint(
+                w13_weight_scale, ep_sharding)
+            w2_weight_scale = jax.lax.with_sharding_constraint(
+                w2_weight_scale, ep_sharding)
 
+        else:
+            # Shard weights for tp (rowwise w13, colwise w2)
             w13_format = Format(
-                Layout((0, 1, 2)),
-                NamedSharding(self.mesh, P(None, "model", None))
-
-
-
-
-
-
-
-
+                Layout((0, 1, 2)),  # expert, 2xintermed, input
+                NamedSharding(self.mesh, P(None, "model", None)),
+            )  # rowwise sharding on intermed dim
+
+            w13_scale_format = Format(
+                Layout(
+                    (0, 1, 2, 3)),  # (num_experts, num_blocks, 1, outer_dim)
+                NamedSharding(self.mesh, P(None, None, None, "model")),
+            )  # col wise GMM sharding on intermed dim
+
+            # Local shard shape: (num_experts, 2 x (intermediate_size // n_shards), input_size)
+            w13_weight = jax.lax.with_sharding_constraint(
+                w13_weight, w13_format)
+            # Local shard shape: (num_experts, (intermediate_size // n_shards), 1)
+            w13_weight_scale = jax.lax.with_sharding_constraint(
+                w13_weight_scale, w13_scale_format)
+
+            # Shard weights for tp (colwise w2)
+            w2_format = Format(
+                Layout((0, 1, 2)),  # expert, intermed, hidden
+                NamedSharding(self.mesh, P(None, None, "model")),
             )
-
-
-
-            )  # replicate
+            # Local shard shape: (num_experts, hidden, (intermediate_size // n_shards))
+            # # (num_experts, num_blocks, 1, outer_dim)
+            w2_weight = jax.lax.with_sharding_constraint(w2_weight, w2_format)
 
-
-
-
+            w2_scale_format = Format(
+                Layout((0, 1, 2, 3)),  # expert, intermed, 1
+                NamedSharding(self.mesh, P(None, None, None, None)),
+            )
+            # Local shard shape: (num_experts, intermediate_size // n_shards, 1)
+            w2_weight_scale = jax.lax.with_sharding_constraint(
+                w2_weight_scale, w2_scale_format)
+
+        w13_weight = Parameter(torch_view(w13_weight), requires_grad=False)
+        w13_weight_scale = Parameter(torch_view(w13_weight_scale),
+                                     requires_grad=False)
         w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
         w2_weight_scale = Parameter(torch_view(w2_weight_scale),
                                     requires_grad=False)
-        w3_weight = Parameter(torch_view(w3_weight), requires_grad=False)
-        w3_weight_scale = Parameter(torch_view(w3_weight_scale),
-                                    requires_grad=False)
 
-
-        layer.
-        layer.w13_weight_scale = w1_weight_scale
+        layer.w13_weight = w13_weight
+        layer.w13_weight_scale = w13_weight_scale
         layer.w2_weight = w2_weight
         layer.w2_weight_scale = w2_weight_scale
-        layer.w3_weight = w3_weight
-        layer.w3_weight_scale = w3_weight_scale
 
     def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
         router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert isinstance(layer, FusedMoE)
-        if activation != "silu":
+        if layer.activation != "silu":
             raise NotImplementedError(
                 "Only silu is supported for activation function.")
-        if scoring_func != "softmax":
+        if layer.scoring_func != "softmax":
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-        # import sys
-        # sys.stdin = open(0)
-        # breakpoint()
-
         # TODO: Use MoE kernel when it supports fp8
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        #x3 = torch.einsum("ti, eoi -> teo", x, layer.w3_weight) * self.w3_weight_scale
-        x3 = call_jax(jax.lax.dot,
-                      x,
-                      layer.w3_weight,
-                      dimension_numbers=(((1, ), (2, )), ((), ())),
-                      preferred_element_type=jnp.bfloat16.dtype
-                      ) * layer.w3_weight_scale.squeeze(2)
-
-        #expert_outs = torch.einsum("teo, eio -> tei", (x1 * x3), self.w2_weight) * self.w2_weight_scale
-        expert_outs = call_jax(
-            jax.lax.dot,
-            x1 * x3,
-            layer.w2_weight,
-            dimension_numbers=(((2, ), (2, )), ((1, ), (0, ))),
-            preferred_element_type=jnp.bfloat16.dtype).transpose(
-                0, 1) * layer.w2_weight_scale.squeeze(2)
-
-        seq_indexes = torch.arange(seqlen, device='jax').unsqueeze(1)
-        expert_outs = expert_outs[seq_indexes, expert_indices]
-
-        # out = torch.einsum("tai,ta -> ti", expert_outs, expert_weights)
-        out = call_jax(jax.lax.dot,
-                       expert_outs,
-                       expert_weights,
-                       dimension_numbers=(((1, ), (1, )), ((0, ), (0, ))),
-                       preferred_element_type=jnp.bfloat16.dtype)
+        x = jax_view(x)
+        w13_weight = jax_view(layer.w13_weight)
+        w2_weight = jax_view(layer.w2_weight)
+        w13_weight_scale = jax_view(layer.w13_weight_scale)
+        w2_weight_scale = jax_view(layer.w2_weight_scale)
+        gating_output = jax_view(router_logits)
+        out = torch_view(
+            fused_moe_func(
+                hidden_states=x,
+                w1=w13_weight,
+                w2=w2_weight,
+                w1_scale=w13_weight_scale,
+                w2_scale=w2_weight_scale,
+                w1_bias=None,
+                w2_bias=None,
+                gating_output=gating_output,
+                topk=layer.top_k,
+                renormalize=layer.renormalize,
+                mesh=self.mesh,
+                use_ep=layer.use_ep,
+                activation=layer.activation,
+            ))
 
         return out
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional
 
 import jax
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py CHANGED
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional
 
 import jax
@@ -0,0 +1,118 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import jax
+import torch
+from jax.sharding import PartitionSpec
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.quantization import \
+    register_quantization_config
+from vllm.model_executor.layers.quantization.base_config import \
+    QuantizeMethodBase
+from vllm.model_executor.layers.quantization.fp8 import (Fp8Config,
+                                                         Fp8LinearMethod)
+from vllm.model_executor.layers.quantization.utils.quant_utils import \
+    is_layer_skipped
+
+from tpu_inference.layers.common.quant_methods import FP8, get_tpu_quant_method
+from tpu_inference.layers.vllm.quantization.common import (
+    JaxCommonConfig, JaxCommonLinearConfig)
+from tpu_inference.layers.vllm.quantization.unquantized import \
+    VllmUnquantizedLinearMethod
+
+P = PartitionSpec
+logger = init_logger(__name__)
+
+
+@register_quantization_config(get_tpu_quant_method(FP8))
+class VllmFp8Config(Fp8Config, JaxCommonConfig):
+
+    @classmethod
+    def get_name(cls):
+        return FP8
+
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        return [torch.bfloat16]
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[Union["LinearMethodBase", "QuantizeMethodBase"]]:
+        if isinstance(layer, LinearBase):
+            linear_config = self.get_linear_config(layer)
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return VllmUnquantizedLinearMethod(linear_config)
+            return VllmFp8LinearMethod(self, linear_config)
+        elif isinstance(layer, FusedMoE):
+            raise NotImplementedError(
+                "FP8 FusedMoE is currently not supported in torchax-jax")
+        return None
+
+
+class VllmFp8LinearMethod(Fp8LinearMethod):
+
+    def __init__(self, quant_config: VllmFp8Config,
+                 jax_config: JaxCommonLinearConfig):
+        super().__init__(quant_config)
+        self.jax_config = jax_config
+        self._configure_sharding()
+
+    def _configure_sharding(self) -> None:
+
+        raise NotImplementedError(
+            "Configure PartitionSpec for weight_sharding and scale_sharding "
+            "based on layer type (RowParallel/ColumnParallel)")
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        raise NotImplementedError(
+            "Convert layer.weight, layer.weight_scale, and optionally "
+            "layer.input_scale and layer.bias from torch tensors to JAX arrays "
+            "using torch_to_jax_param() with appropriate sharding")
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        with jax.named_scope(layer._get_name()):
+            if self.jax_config.fuse_matmuls:
+                out = self._apply_fused(layer, x, bias)
+            else:
+                out = self._apply_split(layer, x, bias)
+
+        return out
+
+    def _apply_fused(self,
+                     layer: torch.nn.Module,
+                     x: torch.Tensor,
+                     bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        raise NotImplementedError(
+            "Implement single matmul for fused outputs: "
+            "quantize input to fp8, perform fp8 matmul with weight and scales, "
+            "dequantize output, and add bias if present")
+
+    def _apply_split(self,
+                     layer: torch.nn.Module,
+                     x: torch.Tensor,
+                     bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
+        raise NotImplementedError(
+            "Implement separate matmuls per output partition: "
+            "split weight/scale by output_sizes, perform fp8 matmul for each, "
+            "concatenate results, and add bias if present")