tpu-inference 0.12.0.dev20251213__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl
Potentially problematic release: the registry has flagged this version of tpu-inference; see the registry's advisory page for details.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +14 -0
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +14 -0
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +180 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +406 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +320 -0
- tests/layers/vllm/test_unquantized.py +662 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +25 -8
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +14 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +20 -3
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +171 -163
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +20 -26
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +112 -69
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +85 -65
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +374 -194
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +13 -0
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/fused_moe_gmm.py +506 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +282 -0
- tpu_inference/layers/common/sharding.py +22 -3
- tpu_inference/layers/common/utils.py +94 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +52 -27
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +19 -6
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +100 -455
- tpu_inference/layers/vllm/linear.py +64 -0
- tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
- tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
- tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
- tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
- tpu_inference/layers/vllm/quantization/__init__.py +19 -3
- tpu_inference/layers/vllm/quantization/awq.py +96 -82
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +19 -5
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +119 -132
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
- tpu_inference/layers/vllm/quantization/{common.py → configs.py} +38 -26
- tpu_inference/layers/vllm/quantization/fp8.py +119 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +133 -220
- tpu_inference/layers/vllm/quantization/unquantized.py +154 -253
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +37 -16
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +113 -124
- tpu_inference/models/jax/gpt_oss.py +23 -7
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +85 -24
- tpu_inference/models/jax/utils/weight_utils.py +32 -1
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +22 -4
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +27 -29
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +69 -35
- tpu_inference/runner/kv_cache.py +14 -0
- tpu_inference/runner/kv_cache_manager.py +15 -2
- tpu_inference/runner/lora_utils.py +16 -1
- tpu_inference/runner/multimodal_manager.py +16 -2
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +30 -10
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +31 -30
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +23 -7
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +1 -1
- tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
- tpu_inference/layers/vllm/linear_common.py +0 -208
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.12.0.dev20251213.dist-info/RECORD +0 -175
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import functools
 import os
 from typing import TYPE_CHECKING, Callable, List
@@ -34,17 +35,43 @@ DEFAULT_NUM_TOKENS_FOR_MODEL_INPUTS = 512
 DEFAULT_MAX_NUM_SEQS_FOR_MODEL_INPUTS = 256
 DEFAULT_MAX_NUM_BLOCKS_PER_REQ = 16

-
+DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG = {
     "qwix": {
         "use_abstract_model":
         True,
         "scale_dtype":
         "bfloat16",
         "rules": [
+            # Exclude router from quantization
             {
                 "module_path": ".*.custom_module.router.*",
                 "weight_qtype": None,
             },
+            # Avoid the combine expert ops
+            {
+                "module_path": ".*combine_experts.*",
+                "weight_qtype": None,
+            },
+            # Attention layers: keep FP8 for weights and activations
+            {
+                "module_path": ".*.attn.*",
+                "weight_qtype": "float8_e4m3fn",
+                "act_qtype": "float8_e4m3fn",
+            },
+            # MoE experts: use FP4 for expert weights
+            {
+                "module_path": ".*.custom_module.*",
+                "weight_qtype": "float4_e2m1fn",
+                "act_qtype": "float8_e4m3fn",
+                "tile_size": 256,
+            },
+            # Shared experts: also FP4
+            {
+                "module_path": ".*.shared_experts.*",
+                "weight_qtype": "float4_e2m1fn",
+                "act_qtype": "float8_e4m3fn",
+                "tile_size": 256,
+            },
             {
                 "module_path": ".*",
                 "weight_qtype": "float8_e4m3fn",
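A note on the rule list above: the ordering suggests first-match resolution, with the router/combine-expert exclusions ahead of the FP4 expert rules and the catch-all ".*" FP8 rule last. A minimal sketch of that first-match lookup, hedged: resolve_rule and the trimmed RULES list are illustrative stand-ins, not code from the package, and first-match semantics is an assumption drawn from the rule ordering.

import re

# Hypothetical, trimmed-down stand-in for the "rules" list above.
RULES = [
    {"module_path": ".*.custom_module.router.*", "weight_qtype": None},
    {"module_path": ".*.attn.*", "weight_qtype": "float8_e4m3fn"},
    {"module_path": ".*", "weight_qtype": "float8_e4m3fn"},
]

def resolve_rule(module_path: str) -> dict:
    # First matching regex wins, so the catch-all ".*" must stay last.
    for rule in RULES:
        if re.fullmatch(rule["module_path"], module_path):
            return rule
    return {}

# The router stays unquantized; non-attention modules fall through to FP8.
assert resolve_rule("model.custom_module.router.gate")["weight_qtype"] is None
assert resolve_rule("model.layers.0.mlp.up_proj")["weight_qtype"] == "float8_e4m3fn"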
@@ -398,8 +425,7 @@ def apply_qwix_on_abstract_model(vllm_config: "VllmConfig") -> bool:


 def get_default_qwix_quantization_config(
-
-        skip_quantization: bool) -> dict | None:
+        hf_config: dict, skip_quantization: bool) -> dict | None:
     """
     Some models are pre-quantized and in those cases, we want to return a default set of
     Qwix quantization rules (instead of forcing the user to pass in a quantization config each time).
@@ -417,9 +443,42 @@
     """
     if skip_quantization:
         return None
-
+    model_type = hf_config.model_type.lower() if hasattr(
+        hf_config, "model_type") else None
+    quant_method = hf_config.quantization_config["quant_method"] if hasattr(
+        hf_config, "quantization_config") else None
+    # TODO (jacobplatin): remove this so that we can support various quantization types + make
+    # more flexible
+    # NOTE (jacobplatin): we'll default to mixed FP8 (attention) + FP4 (MoE experts)
+    # for DeepSeek
     if model_type == "deepseek_v3" and quant_method == "fp8":
-
+        config = copy.deepcopy(DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG)
+
+        # Dynamically fetch block size from HF config if available
+        # Config fmt: 'weight_block_size': [1, 512] -> we want the 2nd dim for tile_size
+        # NOTE: if the checkpoint is not 1D subchannel, we will throw an error
+        hf_quant_config = hf_config.quantization_config
+        assert "weight_block_size" in hf_quant_config, "Expected weight_block_size in quantization_config"
+        block_size = hf_quant_config["weight_block_size"]
+        if isinstance(block_size, (list, tuple)) and len(block_size) == 2:
+            assert block_size[
+                0] == 1, f"Expected first dimension to be 1 (unchanneled), but got {block_size[0]}! If you are trying to run quantized DeepSeek, we currently only support 1D-subchannel quantization and those models can be found here: https://huggingface.co/collections/jrplatin/deepseek-r1-1d-subchannel"
+            tile_size = block_size[1]
+            assert tile_size > 1, f"Expected tile_size > 1 for DeepSeek, but got {tile_size}"
+            logger.info(
+                f"Detected DeepSeek tile_size from config: {tile_size}")
+
+            # Update tile_size in the rules, since we might not always use a 1D subchannel size of
+            # 256
+            for rule in config["qwix"]["rules"]:
+                if "tile_size" in rule:
+                    rule["tile_size"] = tile_size
+        else:
+            raise ValueError(
+                f"Invalid weight_block_size config: {block_size}, expected a list/tuple of length 2"
+            )
+
+        return config
     elif model_type == "llama4" and quant_method == "compressed-tensors":
         return DEFAULT_LLAMA4_FP8_CONFIG
     # MXFP4 (GPT-OSS): provide a default configuration to quantize MoE experts via Qwix
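For context on the weight_block_size handling above: a [1, N] block size means one scale per N-element block along the last weight axis (1D sub-channel quantization). A toy NumPy illustration of that dequantization layout; the shapes and values are made up for the example, not taken from the package.

import numpy as np

tile = 512                                  # tile_size read from weight_block_size[1]
w_q = np.ones((4, 1024), dtype=np.int8)     # stand-in for the packed fp8/fp4 codes
scales = np.full((4, 1024 // tile), 0.5, dtype=np.float32)  # one scale per block

# Dequantize block-wise: each 512-wide slice of a row is scaled by its own factor.
w = (w_q.astype(np.float32).reshape(4, -1, tile) * scales[:, :, None]).reshape(4, 1024)
assert w.shape == (4, 1024) and w[0, 0] == 0.5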
@@ -438,14 +497,10 @@ def update_vllm_config_for_qwix_quantization(vllm_config: "VllmConfig"):
     # Qwix quantization config accordingly
     # NOTE: if a Qwix config is provided (via the`additional_config`), we'll
     # use that instead
-
-    ) if hasattr(vllm_config.model_config.hf_config, "model_type") else None
-    quant_method = vllm_config.model_config.hf_config.quantization_config[
-        "quant_method"] if hasattr(vllm_config.model_config.hf_config,
-                                   "quantization_config") else None
+    hf_config = vllm_config.model_config.hf_config
     default_quantization_config = get_default_qwix_quantization_config(
-
-
+        hf_config, vllm_config.additional_config.get("skip_quantization",
+                                                     False))

     maybe_existing_quantization_config = vllm_config.additional_config.get(
         "quantization")
@@ -502,7 +557,14 @@ def get_random_sharded_array(key: PRNGKey, mesh: Mesh, param: nnx.Param,
         maxval = jnp.array(jnp.iinfo(dtype).max, dtype=dtype)
         weight = jax.random.randint(key, param_shape, minval, maxval, dtype)
     else:
-
+        # NOTE: _uniform() in random.py does not accept float4_e2m1fn
+        # Error: "TypeError: uniform only accepts 8-, 16-, 32-, or 64-bit dtypesgot float4_e2m1fn."
+        # Workaround: call function with dtype jnp.float8_e4m3fn and cast back to float4_e2m1fn
+        if dtype != "float4_e2m1fn":
+            weight = jax.random.normal(key, param_shape, dtype)
+        else:
+            weight = jax.random.normal(key, param_shape,
+                                       jnp.float8_e4m3fn).astype(dtype)

     def get_slice(index):
         return weight[index]
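The cast-through-float8 workaround above can be reproduced standalone. A sketch, assuming a JAX build whose bundled ml_dtypes exposes jnp.float4_e2m1fn:

import jax
import jax.numpy as jnp

key = jax.random.key(0)

# Sampling directly in float4_e2m1fn raises TypeError ("uniform only accepts
# 8-, 16-, 32-, or 64-bit dtypes"), so sample in an 8-bit float and cast down.
w = jax.random.normal(key, (8, 16), jnp.float8_e4m3fn).astype(jnp.float4_e2m1fn)
assert w.dtype == jnp.float4_e2m1fn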
@@ -537,18 +599,16 @@ def load_random_weights_into_qwix_abstract_model(rng: PRNGKey,
     logger.info("Initializing Qwix-quantized model with random weights...")
     # TODO (jacobplatin): clean up this logic
     scale_dtype = model.weight_loader.scale_dtype
-    scale_shape_map = model.weight_loader.
+    scale_shape_map = model.weight_loader.scale_shape_map_for_random_weight_loading if hasattr(
         model.weight_loader,
-        '
+        'scale_shape_map_for_random_weight_loading') else {}
     quantization_block_sizes = quantization_config["weight_block_size"]
     assert len(
         quantization_block_sizes
     ) == 2, f"Expected only 2 quantization block sizes but got {quantization_block_sizes}"
-    quantization_block_size_n, _ = quantization_block_sizes[
-        0], quantization_block_sizes[1]

     # Iterate through all variables and initialize them
-
+
     for path, param in nnx.iter_graph(model):
         if not isinstance(param, nnx.Variable):
             continue
@@ -558,16 +618,17 @@ def load_random_weights_into_qwix_abstract_model(rng: PRNGKey,
         is_qwix_scale = (path[-1] == 'scale' and path[-2] == "array")
         param_dtype = scale_dtype if is_qwix_scale else param.value.dtype
         param_shape = param.value.shape
-        # TODO (jacobplatin): clean this up
         if is_qwix_scale:
-
-
-
-
+            key = f"{path[2]}.{path[3]}"
+
+            if key in scale_shape_map:
+                param_shape = scale_shape_map[key]
+            else:
+                raise ValueError(
+                    f"Scale shape for {key} not found in scale_shape_map.")
         param.value = get_random_sharded_array(
             rng, mesh, param, param_shape, param_dtype,
             ".".join([str(x) for x in path]))
-        prev_param_shape = param_shape

         # Handles the DeepSeek case, where this needs to be called to make the cache weights
         # concrete
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Utilities for downloading model weights from HuggingFace."""

 import functools
@@ -281,7 +294,8 @@ def _load_and_shard_weight(vllm_config,
                            hf_key: str,
                            hf_weight: jax.Array,
                            keep_original_dtype_keys_regex: list[str]
-                           | None = None
+                           | None = None,
+                           pp_missing_layers: list[str] | None = None):
     name_map = metadata_map.name_map
     reshape_keys = metadata_map.reshape_map
     bias_reshape_keys = metadata_map.bias_reshape_map
@@ -337,6 +351,10 @@ def _load_and_shard_weight(vllm_config,
         return
     model_key = name_map.get(hf_key, hf_key)

+    if pp_missing_layers and _is_pp_missing_layer(hf_key, pp_missing_layers):
+        logger.warning(
+            f"Skip loading {hf_key} as it doesn't belong to this PP stage.")
+        return
     model_weight, model_sharding = get_param_and_sharding(
         params, shardings, model_key)

@@ -400,6 +418,14 @@ def _load_and_shard_weight(vllm_config,
     model_weight.value = shard(hf_weight, spec)


+def _is_pp_missing_layer(hf_key: str, pp_missing_layers: list[str]) -> bool:
+    has_digit = any(char.isdigit() for char in hf_key)
+    # add the suffix after digits to avoid it matches "layers.10" with "layers.1"
+    suffix = "." if has_digit else ""
+    return any(f'{pp_missing_layer}{suffix}' in hf_key
+               for pp_missing_layer in pp_missing_layers)
+
+
 def _load_hf_weights_on_thread(
     vllm_config: VllmConfig,
     params: nnx.State,
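The digit-suffix guard in _is_pp_missing_layer keeps a stage-prefix like "layers.1" from also matching "layers.10". A quick standalone behavior check; the helper is re-declared here so the snippet runs on its own:

def _is_pp_missing_layer(hf_key: str, pp_missing_layers: list[str]) -> bool:
    # Mirrors the helper above: require a trailing "." when the key has digits.
    has_digit = any(char.isdigit() for char in hf_key)
    suffix = "." if has_digit else ""
    return any(f'{layer}{suffix}' in hf_key for layer in pp_missing_layers)

assert _is_pp_missing_layer("model.layers.1.self_attn.q_proj.weight", ["layers.1"])
assert not _is_pp_missing_layer("model.layers.10.self_attn.q_proj.weight", ["layers.1"])
assert _is_pp_missing_layer("lm_head", ["lm_head"])  # no digits: plain substring match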
@@ -408,6 +434,7 @@ def _load_hf_weights_on_thread(
     weights_file: str,
     filter_regex: Optional[str] = None,
     keep_original_dtype_keys_regex: Optional[list[str]] = None,
+    pp_missing_layers: list[str] | None = None,
 ):
     """Loads weights from a single weights file."""
     try:
@@ -426,6 +453,7 @@ def _load_hf_weights_on_thread(
                 hf_key,
                 hf_weight,
                 keep_original_dtype_keys_regex,
+                pp_missing_layers,
             )


@@ -437,6 +465,7 @@ def load_hf_weights(
     filter_regex: Optional[str] = None,
     is_draft_model: bool = False,
     keep_original_dtype_keys_regex: Optional[list[str]] = None,
+    pp_missing_layers: list[str] | None = None,
 ):
     """Load weights into a JAX model from either an iterator or files."""
     params = nnx.state(model)
@@ -467,6 +496,7 @@
                 hf_key,
                 hf_weight_jax,
                 keep_original_dtype_keys_regex,
+                pp_missing_layers=pp_missing_layers,
             )
     else:
         # File-based path (multi-threaded)
@@ -494,6 +524,7 @@
                 filter_regex=filter_regex,
                 keep_original_dtype_keys_regex=
                 keep_original_dtype_keys_regex,
+                pp_missing_layers=pp_missing_layers,
             ) for weights_file in weights_files
         ]
         for future in futures:
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import copy
 import functools
 from collections.abc import Sequence
@@ -23,8 +37,10 @@ from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.sequence import IntermediateTensors

 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.vllm.process_weights.cleanup_sharding import \
+    shard_model_to_tpu
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
-from tpu_inference.layers.vllm.sharding import shard_model_to_tpu
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.jax_intermediate_tensor import \
     JaxIntermediateTensors
@@ -197,7 +213,7 @@ class VllmModelWrapper:
             kwargs={
                 "input_ids": torch_view(input_ids),
                 "positions": torch_view(input_positions),
-                "intermediate_tensors":
+                "intermediate_tensors": intermediate_tensors,
                 "inputs_embeds": None,
             },
             tie_weights=False,
@@ -220,8 +236,10 @@ class VllmModelWrapper:

         @functools.partial(
             jax.jit,
-            out_shardings=(NamedSharding(
-
+            out_shardings=(NamedSharding(
+                self.mesh,
+                PartitionSpec(ShardingAxisName.MLP_DATA,
+                              ShardingAxisName.MLP_TENSOR))),
         )
         def compute_logits_func(
             params_and_buffers: Any,
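For readers unfamiliar with the out_shardings change above: jit's out_shardings pins the layout of the returned array to a named mesh partition, here (data, tensor) for the logits. A self-contained toy on a single-device mesh; the axis name "data" is illustrative, not ShardingAxisName's actual value.

import functools
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

mesh = Mesh(np.array(jax.devices()[:1]), ("data",))  # toy single-axis mesh

@functools.partial(
    jax.jit,
    out_shardings=NamedSharding(mesh, PartitionSpec("data")),
)
def double(x):
    # The output is constrained to be row-sharded along the "data" mesh axis.
    return x * 2

y = double(jnp.ones((8, 4)))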
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import Dict, List, Optional
@@ -1,2 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # ruff: noqa
 from tpu_inference.platforms.tpu_platform import TpuPlatform
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional, Tuple, Union, cast

 import jax.numpy as jnp
 import torch
@@ -15,6 +15,7 @@ from tpu_inference.logger import init_logger

 if TYPE_CHECKING:
     from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import BlockSize, ModelConfig, VllmConfig
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams, SamplingType
@@ -51,11 +52,10 @@

     @classmethod
     def get_attn_backend_cls(cls, selected_backend: "AttentionBackendEnum",
-
-
-                             use_mla: bool, has_sink: bool, use_sparse: bool,
-                             use_mm_prefix: bool, attn_type: Any) -> str:
+                             attn_selector_config: "AttentionSelectorConfig",
+                             **kwargs) -> str:
         from vllm.attention.backends.registry import AttentionBackendEnum
+
         if selected_backend != AttentionBackendEnum.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)

@@ -145,17 +145,20 @@
         compilation_config.backend = "openxla"

         # TODO(cuiq): remove this dependency.
-
-
-
-
-
-
-
-
-
-
-
+        if vllm_config.model_config:
+            from vllm.v1.attention.backends.pallas import \
+                PallasAttentionBackend
+            cache_config.block_size = PallasAttentionBackend.get_page_size(
+                vllm_config)  # type: ignore[assignment]
+            min_page_size = PallasAttentionBackend.get_min_page_size(
+                vllm_config)
+            if min_page_size > cache_config.block_size:
+                logger.warning(
+                    "Increase the page size from %s to %s to avoid SMEM OOM",
+                    cache_config.block_size,
+                    min_page_size,
+                )
+                cache_config.block_size = min_page_size  # type: ignore[assignment]

         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
@@ -165,12 +168,12 @@
         multihost_backend = envs.TPU_MULTIHOST_BACKEND
         if not multihost_backend:  # Single host
             if parallel_config.pipeline_parallel_size == 1:
-                logger.info("Force using UniProcExecutor for JAX on
-
+                logger.info("Force using UniProcExecutor for JAX on "
+                            "single host without pipeline parallelism.")
                 parallel_config.distributed_executor_backend = "uni"
             else:
-                logger.info("Force using MultiprocExecutor for JAX on
-
+                logger.info("Force using MultiprocExecutor for JAX on "
+                            "single host with pipeline parallelism.")
                 parallel_config.distributed_executor_backend = "mp"
         elif multihost_backend == "ray":
             from tpu_inference.executors.ray_distributed_executor import \
@@ -186,20 +189,15 @@

         if scheduler_config.is_multimodal_model and not \
             scheduler_config.disable_chunked_mm_input:
-            logger.warning("TPU does not support running Multimodal models"
-
-
+            logger.warning("TPU does not support running Multimodal models"
+                           " without setting `--disable_chunked_mm_input`. "
+                           "Forcing --disable_chunked_mm_input.")
             scheduler_config.disable_chunked_mm_input = True

         kv_transfer_config = vllm_config.kv_transfer_config
         if kv_transfer_config is not None:
             assert kv_transfer_config.kv_connector == "TPUConnector"
-        # Late initialization to avoid circular import
-        from tpu_inference.models.jax.utils.quantization.quantization_utils import \
-            update_vllm_config_for_qwix_quantization
-
-        update_vllm_config_for_qwix_quantization(vllm_config)
-
+        # Late initialization to avoid circular import.
         from tpu_inference.core.sched.dp_scheduler import \
             update_vllm_config_for_dp_scheduler
         update_vllm_config_for_dp_scheduler(vllm_config)
tpu_inference/runner/__init__.py CHANGED
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

@@ -32,6 +46,8 @@ class CompilationManager:

     def __init__(self, runner: "TPUModelRunner"):
         self.runner = runner
+        self._sampling_precompiled = False
+        self._gather_logprobs_precompiled = False
         if not vllm_envs.VLLM_DISABLE_COMPILE_CACHE:
             logger.info("Enabling JAX compile cache.")
             jax.config.update("jax_compilation_cache_dir",
@@ -86,9 +102,13 @@
             return
         self._precompile_select_from_array()
         self._precompile_compute_logits()
+        # Skip sampling if already precompiled before KV cache allocation
+        if not self._sampling_precompiled:
+            self._precompile_sampling()
         self._precompile_disagg_utils()
-
-        self.
+        # Skip gather_logprobs if already precompiled before KV cache allocation
+        if not self._gather_logprobs_precompiled:
+            self._precompile_gather_logprobs()
         self._precompile_structured_decoding()
         if self.runner.speculative_config:
             self._precompile_speculative_decoding()
@@ -107,7 +127,7 @@

         self._run_compilation(
             "input_embeddings_merger",
-            self.runner.
+            self.runner.embed_input_ids_fn,
             self.runner.state,
             dummy_input_ids,
             dummy_multimodal_embeddings,
@@ -116,7 +136,7 @@

         self._run_compilation(
             "input_embeddings_merger_text_only",
-            self.runner.
+            self.runner.embed_input_ids_fn,
             self.runner.state,
             dummy_input_ids,
             None,
@@ -475,35 +495,39 @@
         logits = self._create_dummy_tensor((num_reqs, hsize), jnp.bfloat16,
                                            logits_sharding)
         for do_sampling in (True, False):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            for logprobs in (True, False):
+                if do_sampling:
+                    temperature = np.full((num_reqs, ),
+                                          0.7,
+                                          dtype=np.float32)
+                    top_k = np.full((num_reqs, ), 20, dtype=np.int32)
+                    top_p = np.full((num_reqs, ), 0.8, dtype=np.float32)
+                    (temperature, top_k, top_p) = device_array(
+                        self.runner.mesh, (temperature, top_k, top_p),
+                        sharding=sampling_metadata_sharding)
+                else:
+                    temperature = None
+                    top_k = None
+                    top_p = None
+
+                sampling_metadata = TPUSupportedSamplingMetadata(
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_p=top_p,
+                    do_sampling=do_sampling,
+                    logprobs=logprobs)
+                self._run_compilation(
+                    f"worker{self.runner.rank} sample",
+                    sample,
+                    self.runner.rng_params_for_sampling,
+                    self.runner.mesh,
+                    logits,
+                    sampling_metadata,
+                    num_reqs=num_reqs,
+                    do_sampling=do_sampling,
+                )
+
+        self._sampling_precompiled = True

     def _precompile_disagg_utils(self) -> None:
         if not is_disagg_enabled():
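The nested loops above precompile one jitted sample variant per (do_sampling, logprobs) pair, since each flag changes the traced graph, and the _sampling_precompiled flag prevents recompiling if this already ran before KV-cache allocation. A sketch of that guard-plus-enumeration pattern; the class and method names here are illustrative, not the runner's API:

import itertools

class PrecompileOnce:
    def __init__(self):
        self._sampling_precompiled = False

    def precompile_sampling(self, compile_fn):
        if self._sampling_precompiled:
            return  # already compiled earlier; skip the duplicate work
        # One compilation per static-flag combination, as in the hunk above.
        for do_sampling, logprobs in itertools.product((True, False), repeat=2):
            compile_fn(do_sampling=do_sampling, logprobs=logprobs)
        self._sampling_precompiled = True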
@@ -533,8 +557,16 @@
         logger.info("Compiling gather_logprobs with different input shapes.")
         hsize = self.runner.model_config.get_vocab_size()
         for num_reqs in self.runner.num_reqs_paddings:
-
-
+            logits_sharding = NamedSharding(
+                self.runner.mesh,
+                PartitionSpec(ShardingAxisName.MLP_DATA,
+                              ShardingAxisName.MLP_TENSOR))
+            token_ids_sharding = NamedSharding(
+                self.runner.mesh, PartitionSpec(ShardingAxisName.MLP_DATA, ))
+            logits = self._create_dummy_tensor((num_reqs, hsize), jnp.bfloat16,
+                                               logits_sharding)
+            token_ids = self._create_dummy_tensor((num_reqs, ), jnp.int32,
+                                                  token_ids_sharding)
             self._run_compilation(
                 f"worker{self.runner.rank} gather_logprobs",
                 self.runner._compute_and_gather_logprobs,
@@ -544,6 +576,8 @@
                 num_reqs=num_reqs,
             )

+        self._gather_logprobs_precompiled = True
+
     def _precompile_speculative_decoding(self) -> None:
         logger.info(
             "Compiling speculative_decoding with different input shapes.")