tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +317 -34
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +406 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +320 -0
- tests/layers/vllm/test_unquantized.py +662 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +26 -6
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +110 -12
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +2 -45
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +15 -10
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +92 -8
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +25 -4
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +25 -12
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/fused_moe_gmm.py +506 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +282 -0
- tpu_inference/layers/common/sharding.py +32 -9
- tpu_inference/layers/common/utils.py +94 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +101 -494
- tpu_inference/layers/vllm/linear.py +64 -0
- tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
- tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
- tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
- tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
- tpu_inference/layers/vllm/quantization/__init__.py +19 -3
- tpu_inference/layers/vllm/quantization/awq.py +96 -82
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
- tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
- tpu_inference/layers/vllm/quantization/fp8.py +119 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
- tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +112 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +18 -5
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +179 -51
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
- tpu_inference/models/jax/utils/weight_utils.py +234 -155
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +51 -72
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +180 -80
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +55 -33
- tpu_inference/runner/lora_utils.py +16 -1
- tpu_inference/runner/multimodal_manager.py +16 -2
- tpu_inference/runner/persistent_batch_manager.py +54 -2
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +16 -3
- tpu_inference/runner/tpu_runner.py +124 -61
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +84 -22
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +72 -44
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +66 -52
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
- tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
- tpu_inference/layers/vllm/linear_common.py +0 -186
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/awq.py

@@ -1,11 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional, Union
 
 import jax
 import jax.numpy as jnp
 import torch
-from jax.sharding import
+from jax.sharding import PartitionSpec
+from torch.nn.parameter import Parameter
 from torchax.interop import jax_view, torch_view
-from
+from torchax.ops.mappings import t2j
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization import \
@@ -14,24 +29,29 @@ from vllm.model_executor.layers.quantization.awq import (AWQConfig,
                                                          AWQLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizeMethodBase
-from vllm.model_executor.layers.quantization.utils.quant_utils import
-    is_layer_skipped
-from vllm.scalar_type import scalar_types
+from vllm.model_executor.layers.quantization.utils.quant_utils import \
+    is_layer_skipped
 
 from tpu_inference.layers.common.quant_methods import AWQ, get_tpu_quant_method
-from tpu_inference.layers.
-
-
-
+from tpu_inference.layers.common.quantization import awq_u32_unpack_u4
+from tpu_inference.layers.common.utils import \
+    slice_sharded_tensor_for_concatenation
+from tpu_inference.layers.vllm.process_weights.linear_weights import (
+    LinearWeights, process_lienar_weights, shard_linear_weights,
+    to_parameter_list)
+from tpu_inference.layers.vllm.quantization.configs import (
+    VllmQuantConfig, VllmQuantLinearConfig)
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedLinearMethod
+from tpu_inference.logger import init_logger
 
 P = PartitionSpec
+
 logger = init_logger(__name__)
 
 
 @register_quantization_config(get_tpu_quant_method(AWQ))
-class VllmAWQConfig(AWQConfig,
+class VllmAWQConfig(AWQConfig, VllmQuantConfig):
 
     @classmethod
     def get_name(cls):
@@ -39,7 +59,7 @@ class VllmAWQConfig(AWQConfig, JaxCommonConfig):
 
     def get_supported_act_dtypes(self) -> list[torch.dtype]:
         # NOTE: AWQ checkpoint was quantized with float16. But on TPUs, using
-        # bfloat16 is
+        # bfloat16 is significantly preferred over float16. This might lead to
         # some numeric output change.
         return [torch.bfloat16]
 
@@ -60,72 +80,79 @@ class VllmAWQConfig(AWQConfig, JaxCommonConfig):
 class VllmAWQLinearMethod(AWQLinearMethod):
 
     def __init__(self, quant_config: VllmAWQConfig,
-
+                 linear_config: VllmQuantLinearConfig):
         super().__init__(quant_config)
-        self.
-
-        out_sharding, in_sharding = self.jax_config.weight_sharding[:]
-        self.jax_config.weight_sharding = P(in_sharding, None, out_sharding)
-        self.jax_config.scale_sharding = P(in_sharding, out_sharding)
+        self.linear_config = linear_config
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        qweight
-
-
-        group_size = self.quant_config.group_size
-        # Reshape so that each qweight[i] were quantized with same scales[i].
-        qweight = qweight.reshape((-1, group_size, layer.output_size))
-        qweight = torch_to_jax_param(qweight,
-                                     NamedSharding(
-                                         self.jax_config.mesh,
-                                         self.jax_config.weight_sharding),
-                                     self.jax_config.output_sizes,
-                                     self.jax_config.n_shards,
-                                     self.jax_config.fuse_matmuls,
-                                     dim=2,
-                                     jax_dtype=jnp.uint4)
+        assert layer.qweight.packed_dim == layer.qweight.ndim - 1
+        weight = t2j(layer.qweight, use_dlpack=False)
         delattr(layer, "qweight")
-
-
-        qzeros = layer.qzeros
-        qzeros = unpack_awq_weight(qzeros, qzeros.packed_dim)
-        qzeros = torch_to_jax_param(qzeros,
-                                    NamedSharding(
-                                        self.jax_config.mesh,
-                                        self.jax_config.scale_sharding),
-                                    self.jax_config.output_sizes,
-                                    self.jax_config.n_shards,
-                                    self.jax_config.fuse_matmuls,
-                                    dim=1,
-                                    jax_dtype=jnp.uint4)
-        delattr(layer, "qzeros")
-        layer.qzeros = qzeros
-
-        scales = torch_to_jax_param(layer.scales,
-                                    NamedSharding(
-                                        self.jax_config.mesh,
-                                        self.jax_config.scale_sharding),
-                                    self.jax_config.output_sizes,
-                                    self.jax_config.n_shards,
-                                    self.jax_config.fuse_matmuls,
-                                    dim=1)
+
+        weight_scale = t2j(layer.scales, use_dlpack=False)
         delattr(layer, "scales")
-
+
+        assert layer.qzeros.packed_dim == layer.qzeros.ndim - 1
+        zero_point = t2j(layer.qzeros, use_dlpack=False)
+        delattr(layer, "qzeros")
 
         if layer.bias is not None and not layer.skip_bias_add:
             if layer.return_bias:
                 logger.warning_once("Bias might return incorrect value.")
-
-            bias = torch_to_jax_param(
-                layer.bias,
-                NamedSharding(self.jax_config.mesh,
-                              self.jax_config.bias_sharding),
-                self.jax_config.output_sizes,
-                self.jax_config.n_shards,
-                self.jax_config.fuse_matmuls,
-            )
+            bias = t2j(layer.bias, use_dlpack=False)
             delattr(layer, "bias")
-
+        else:
+            bias = None
+
+        @jax.jit
+        def process_awq_linear_weights(
+            weight: jax.Array,
+            weight_scale: jax.Array,
+            zero_point: jax.Array,
+            bias: jax.Array | None,
+        ) -> LinearWeights:
+            weight = awq_u32_unpack_u4(weight)
+            group_size = self.quant_config.group_size
+            weight = weight.reshape((-1, group_size, weight.shape[-1]))
+
+            zero_point = awq_u32_unpack_u4(zero_point)
+
+            return process_lienar_weights(
+                LinearWeights(
+                    weight=weight,
+                    weight_scale=weight_scale,
+                    zero_point=zero_point,
+                    bias=bias,
+                ),
+                fused=self.linear_config.fuse_matmuls,
+                output_sizes=self.linear_config.output_sizes,
+                reorder_size=self.linear_config.n_shards,
+                transposed=False,
+            )
+
+        weights = process_awq_linear_weights(weight, weight_scale, zero_point,
+                                             bias)
+        weights = torch_view(
+            shard_linear_weights(
+                weights,
+                mesh=self.linear_config.mesh,
+                weight_p_spec=self.linear_config.weight_sharding,
+                bias_p_spec=self.linear_config.bias_sharding,
+                transposed=False,
+            ))
+
+        if self.linear_config.fuse_matmuls:
+            layer.qweight = Parameter(weights.weight, requires_grad=False)
+            layer.scales = Parameter(weights.weight_scale, requires_grad=False)
+            layer.qzeros = Parameter(weights.zero_point, requires_grad=False)
+            if bias is not None:
+                layer.bias = Parameter(weights.bias, requires_grad=False)
+        else:
+            layer.qweight = to_parameter_list(weights.weight)
+            layer.scales = to_parameter_list(weights.weight_scale)
+            layer.qzeros = to_parameter_list(weights.zero_point)
+            if bias is not None:
+                layer.bias = to_parameter_list(weights.bias)
 
     def apply(self,
               layer: torch.nn.Module,
@@ -133,7 +160,7 @@ class VllmAWQLinearMethod(AWQLinearMethod):
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         with jax.named_scope(layer._get_name()):
-            if self.
+            if self.linear_config.fuse_matmuls:
                 out = self._apply_fused(layer, x, bias)
             else:
                 out = self._apply_split(layer, x, bias)
@@ -161,7 +188,7 @@ class VllmAWQLinearMethod(AWQLinearMethod):
             outs += bias.jax()
 
         outs = slice_sharded_tensor_for_concatenation(
-            outs, self.
+            outs, self.linear_config.output_sizes, self.linear_config.n_shards)
         out = jnp.concatenate(outs, axis=-1)
         return torch_view(out)
 
@@ -192,16 +219,3 @@ class VllmAWQLinearMethod(AWQLinearMethod):
             outs.append(out)
         out = jnp.concatenate(outs, axis=-1)
         return torch_view(out)
-
-
-def unpack_awq_weight(weight: torch.Tensor, packed_dim: int):
-    weight = unpack_quantized_values_into_int32(weight, scalar_types.uint4,
-                                                packed_dim)
-
-    # AWQ packs 8 uint4 into 32-bits in this order: (0, 2, 4, 6, 1, 3, 5, 7).
-    # Following list maps the order used by AWQ into an ascending order.
-    reverse_awq_order = (0, 4, 1, 5, 2, 6, 3, 7)
-
-    orig_shape = weight.shape
-    weight = weight.reshape(orig_shape[:-1] + (-1, 8))
-    return weight[..., reverse_awq_order].reshape(orig_shape)

New file (contains only the Apache-2.0 license header):

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py

@@ -1,9 +1,22 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional
 
 import torch
 from jax.sharding import PartitionSpec
 from vllm.attention.layer import Attention
-from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization import \
@@ -18,22 +31,23 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
 
 from tpu_inference.layers.common.quant_methods import (COMPRESSED_TENSORS,
                                                        get_tpu_quant_method)
-from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import \
-
+    VllmCompressedTensorsMoEMethod
 from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_fp8 import \
     VllmCompressedTensorsW8A8Fp8
 from tpu_inference.layers.vllm.quantization.compressed_tensors.schemes.compressed_tensors_w8a8_int8 import \
     VllmCompressedTensorsW8A8Int8
+from tpu_inference.layers.vllm.quantization.configs import VllmQuantConfig
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedConfig
+from tpu_inference.logger import init_logger
 
 P = PartitionSpec
 logger = init_logger(__name__)
 
 
 @register_quantization_config(get_tpu_quant_method(COMPRESSED_TENSORS))
-class VllmCompressedTensorsConfig(CompressedTensorsConfig,
+class VllmCompressedTensorsConfig(CompressedTensorsConfig, VllmQuantConfig):
 
     @classmethod
     def get_name(cls) -> str:
@@ -84,14 +98,14 @@ class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
             return VllmCompressedTensorsW8A8Fp8(
                 weight_quant=weight_quant,
                 is_static_input_scheme=is_static_input_scheme,
-
+                linear_config=linear_config,
             )
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
             return VllmCompressedTensorsW8A8Int8(
                 strategy=weight_quant.strategy,
                 is_static_input_scheme=False,
                 input_symmetric=input_quant.symmetric,
-
+                linear_config=linear_config,
             )
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
@@ -113,8 +127,9 @@ class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
             layer.scheme = scheme
             return CompressedTensorsLinearMethod(self)
         if isinstance(layer, FusedMoE):
-
-
+            layer.moe_config = self.get_moe_config(layer)
+            return VllmCompressedTensorsMoEMethod.get_moe_method(
+                self, layer, layer_name=prefix)
         if isinstance(layer, Attention):
             return CompressedTensorsKVCacheMethod(self)
         return None