tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic.

Files changed (257)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_disagg_utils.py +14 -0
  4. tests/core/test_dp_scheduler.py +650 -768
  5. tests/core/test_init.py +14 -0
  6. tests/distributed/__init__.py +13 -0
  7. tests/distributed/test_distributed_utils.py +120 -0
  8. tests/distributed/test_tpu_connector.py +478 -0
  9. tests/e2e/__init__.py +13 -0
  10. tests/e2e/test_async_scheduler.py +211 -0
  11. tests/e2e/test_data_parallel.py +289 -0
  12. tests/e2e/test_hybrid_kvcache.py +219 -0
  13. tests/e2e/test_local_disagg.py +257 -0
  14. tests/e2e/test_model_loader.py +268 -0
  15. tests/e2e/test_multi_modal_inference.py +111 -0
  16. tests/e2e/test_pipeline_parallel.py +265 -0
  17. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  18. tests/e2e/test_sampling_params.py +269 -0
  19. tests/e2e/test_speculative_decoding.py +311 -0
  20. tests/e2e/test_structured_decoding.py +46 -0
  21. tests/executors/__init__.py +13 -0
  22. tests/executors/test_ray_distributed_executor.py +199 -0
  23. tests/experimental/__init__.py +13 -0
  24. tests/experimental/test_llama3_jax_stashed.py +208 -0
  25. tests/kernels/__init__.py +13 -0
  26. tests/kernels/collectives/__init__.py +13 -0
  27. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  28. tests/kernels/fused_moe_v1_test.py +317 -34
  29. tests/kernels/gmm_test.py +205 -0
  30. tests/kernels/mla_v1_test.py +143 -41
  31. tests/kernels/quantized_matmul_kernel_test.py +2 -34
  32. tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
  33. tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
  34. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
  35. tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
  36. tests/layers/__init__.py +13 -0
  37. tests/layers/common/__init__.py +13 -0
  38. tests/layers/common/test_attention_interface.py +156 -0
  39. tests/layers/common/test_quantization.py +149 -0
  40. tests/layers/jax/__init__.py +13 -0
  41. tests/layers/jax/attention/__init__.py +13 -0
  42. tests/layers/jax/attention/test_common_attention.py +103 -0
  43. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  44. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  45. tests/layers/jax/moe/__init__.py +13 -0
  46. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  47. tests/layers/jax/sample/__init__.py +13 -0
  48. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  49. tests/layers/jax/sample/test_sampling.py +115 -0
  50. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  51. tests/layers/jax/test_layers.py +155 -0
  52. tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
  53. tests/layers/jax/test_rope.py +93 -0
  54. tests/layers/jax/test_sharding.py +159 -0
  55. tests/layers/jax/test_transformer_block.py +152 -0
  56. tests/layers/vllm/__init__.py +13 -0
  57. tests/layers/vllm/test_attention.py +363 -0
  58. tests/layers/vllm/test_awq.py +406 -0
  59. tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
  60. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
  62. tests/layers/vllm/test_fp8.py +17 -0
  63. tests/layers/vllm/test_mxfp4.py +320 -0
  64. tests/layers/vllm/test_unquantized.py +662 -0
  65. tests/layers/vllm/utils.py +87 -0
  66. tests/lora/__init__.py +13 -0
  67. tests/lora/conftest.py +14 -0
  68. tests/lora/test_bgmv.py +14 -0
  69. tests/lora/test_layers.py +26 -6
  70. tests/lora/test_lora.py +15 -1
  71. tests/lora/test_lora_perf.py +67 -0
  72. tests/models/__init__.py +13 -0
  73. tests/models/common/__init__.py +13 -0
  74. tests/models/common/test_model_loader.py +455 -0
  75. tests/models/jax/__init__.py +13 -0
  76. tests/models/jax/test_deepseek_v3.py +401 -0
  77. tests/models/jax/test_llama3.py +184 -0
  78. tests/models/jax/test_llama4.py +298 -0
  79. tests/models/jax/test_llama_eagle3.py +197 -0
  80. tests/models/jax/test_llama_guard_4.py +242 -0
  81. tests/models/jax/test_qwen2.py +172 -0
  82. tests/models/jax/test_qwen2_5_vl.py +605 -0
  83. tests/models/jax/test_qwen3.py +169 -0
  84. tests/models/jax/test_weight_loading.py +180 -0
  85. tests/models/jax/utils/__init__.py +13 -0
  86. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  87. tests/platforms/__init__.py +13 -0
  88. tests/platforms/test_tpu_platform.py +54 -0
  89. tests/runner/__init__.py +13 -0
  90. tests/runner/test_block_table.py +395 -0
  91. tests/runner/test_input_batch.py +226 -0
  92. tests/runner/test_kv_cache.py +220 -0
  93. tests/runner/test_kv_cache_manager.py +498 -0
  94. tests/runner/test_multimodal_manager.py +429 -0
  95. tests/runner/test_persistent_batch_manager.py +84 -0
  96. tests/runner/test_speculative_decoding_manager.py +368 -0
  97. tests/runner/test_structured_decoding_manager.py +220 -0
  98. tests/runner/test_tpu_runner.py +261 -0
  99. tests/runner/test_tpu_runner_dp.py +1099 -0
  100. tests/runner/test_tpu_runner_mesh.py +200 -0
  101. tests/runner/test_utils.py +411 -0
  102. tests/spec_decode/__init__.py +13 -0
  103. tests/spec_decode/test_eagle3.py +311 -0
  104. tests/test_base.py +14 -0
  105. tests/test_envs.py +110 -12
  106. tests/test_tpu_info.py +14 -0
  107. tests/test_utils.py +2 -45
  108. tests/worker/__init__.py +13 -0
  109. tests/worker/tpu_worker_test.py +414 -0
  110. tpu_inference/__init__.py +14 -0
  111. tpu_inference/core/__init__.py +13 -0
  112. tpu_inference/core/sched/__init__.py +13 -0
  113. tpu_inference/core/sched/dp_scheduler.py +372 -56
  114. tpu_inference/distributed/__init__.py +13 -0
  115. tpu_inference/distributed/jax_parallel_state.py +14 -0
  116. tpu_inference/distributed/tpu_connector.py +15 -10
  117. tpu_inference/distributed/utils.py +56 -4
  118. tpu_inference/envs.py +92 -8
  119. tpu_inference/executors/__init__.py +13 -0
  120. tpu_inference/executors/ray_distributed_executor.py +25 -4
  121. tpu_inference/experimental/__init__.py +13 -0
  122. tpu_inference/experimental/llama3_jax_stashed.py +14 -0
  123. tpu_inference/kernels/__init__.py +13 -0
  124. tpu_inference/kernels/collectives/__init__.py +13 -0
  125. tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
  126. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
  127. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  128. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  129. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  130. tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
  131. tpu_inference/kernels/megablox/__init__.py +13 -0
  132. tpu_inference/kernels/megablox/common.py +54 -0
  133. tpu_inference/kernels/megablox/gmm.py +646 -0
  134. tpu_inference/kernels/mla/__init__.py +13 -0
  135. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  136. tpu_inference/kernels/mla/v1/kernel.py +117 -145
  137. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  138. tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
  139. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  140. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  141. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
  142. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
  143. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  144. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
  145. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
  146. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
  147. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
  148. tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
  149. tpu_inference/layers/__init__.py +13 -0
  150. tpu_inference/layers/common/__init__.py +13 -0
  151. tpu_inference/layers/common/attention_interface.py +25 -12
  152. tpu_inference/layers/common/attention_metadata.py +14 -0
  153. tpu_inference/layers/common/fused_moe_gmm.py +506 -0
  154. tpu_inference/layers/common/quant_methods.py +15 -0
  155. tpu_inference/layers/common/quantization.py +282 -0
  156. tpu_inference/layers/common/sharding.py +32 -9
  157. tpu_inference/layers/common/utils.py +94 -0
  158. tpu_inference/layers/jax/__init__.py +13 -0
  159. tpu_inference/layers/jax/attention/__init__.py +13 -0
  160. tpu_inference/layers/jax/attention/attention.py +19 -6
  161. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
  162. tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
  163. tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
  164. tpu_inference/layers/jax/base.py +14 -0
  165. tpu_inference/layers/jax/constants.py +13 -0
  166. tpu_inference/layers/jax/layers.py +14 -0
  167. tpu_inference/layers/jax/misc.py +14 -0
  168. tpu_inference/layers/jax/moe/__init__.py +13 -0
  169. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
  170. tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
  171. tpu_inference/layers/jax/moe/moe.py +43 -3
  172. tpu_inference/layers/jax/pp_utils.py +53 -0
  173. tpu_inference/layers/jax/rope.py +14 -0
  174. tpu_inference/layers/jax/rope_interface.py +14 -0
  175. tpu_inference/layers/jax/sample/__init__.py +13 -0
  176. tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
  177. tpu_inference/layers/jax/sample/sampling.py +15 -1
  178. tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
  179. tpu_inference/layers/jax/transformer_block.py +14 -0
  180. tpu_inference/layers/vllm/__init__.py +13 -0
  181. tpu_inference/layers/vllm/attention.py +4 -4
  182. tpu_inference/layers/vllm/fused_moe.py +101 -494
  183. tpu_inference/layers/vllm/linear.py +64 -0
  184. tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
  185. tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
  186. tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
  187. tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
  188. tpu_inference/layers/vllm/quantization/__init__.py +19 -3
  189. tpu_inference/layers/vllm/quantization/awq.py +96 -82
  190. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  191. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
  192. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
  193. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  194. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
  195. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
  196. tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
  197. tpu_inference/layers/vllm/quantization/fp8.py +119 -0
  198. tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
  199. tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
  200. tpu_inference/lora/__init__.py +13 -0
  201. tpu_inference/lora/torch_lora_ops.py +8 -13
  202. tpu_inference/models/__init__.py +13 -0
  203. tpu_inference/models/common/__init__.py +13 -0
  204. tpu_inference/models/common/model_loader.py +112 -35
  205. tpu_inference/models/jax/__init__.py +13 -0
  206. tpu_inference/models/jax/deepseek_v3.py +267 -157
  207. tpu_inference/models/jax/gpt_oss.py +26 -10
  208. tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
  209. tpu_inference/models/jax/llama3.py +99 -36
  210. tpu_inference/models/jax/llama4.py +14 -0
  211. tpu_inference/models/jax/llama_eagle3.py +18 -5
  212. tpu_inference/models/jax/llama_guard_4.py +15 -1
  213. tpu_inference/models/jax/qwen2.py +17 -2
  214. tpu_inference/models/jax/qwen2_5_vl.py +179 -51
  215. tpu_inference/models/jax/qwen3.py +17 -2
  216. tpu_inference/models/jax/utils/__init__.py +13 -0
  217. tpu_inference/models/jax/utils/file_utils.py +14 -0
  218. tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
  219. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  220. tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
  221. tpu_inference/models/jax/utils/weight_utils.py +234 -155
  222. tpu_inference/models/vllm/__init__.py +13 -0
  223. tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
  224. tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
  225. tpu_inference/platforms/__init__.py +14 -0
  226. tpu_inference/platforms/tpu_platform.py +51 -72
  227. tpu_inference/runner/__init__.py +13 -0
  228. tpu_inference/runner/compilation_manager.py +180 -80
  229. tpu_inference/runner/kv_cache.py +54 -20
  230. tpu_inference/runner/kv_cache_manager.py +55 -33
  231. tpu_inference/runner/lora_utils.py +16 -1
  232. tpu_inference/runner/multimodal_manager.py +16 -2
  233. tpu_inference/runner/persistent_batch_manager.py +54 -2
  234. tpu_inference/runner/speculative_decoding_manager.py +14 -0
  235. tpu_inference/runner/structured_decoding_manager.py +16 -3
  236. tpu_inference/runner/tpu_runner.py +124 -61
  237. tpu_inference/runner/utils.py +2 -2
  238. tpu_inference/spec_decode/__init__.py +13 -0
  239. tpu_inference/spec_decode/jax/__init__.py +13 -0
  240. tpu_inference/spec_decode/jax/eagle3.py +84 -22
  241. tpu_inference/tpu_info.py +14 -0
  242. tpu_inference/utils.py +72 -44
  243. tpu_inference/worker/__init__.py +13 -0
  244. tpu_inference/worker/tpu_worker.py +66 -52
  245. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
  246. tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
  247. tpu_inference/layers/vllm/linear_common.py +0 -186
  248. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  249. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
  250. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
  251. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
  252. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
  253. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
  254. tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
  255. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
  256. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
  257. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
tpu_inference/models/jax/deepseek_v3.py
@@ -1,3 +1,18 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
  import re
  from dataclasses import dataclass
  from typing import List, Optional, Tuple
@@ -13,6 +28,8 @@ from torchax.ops.mappings import j2t_dtype
  from vllm.config import VllmConfig

  from tpu_inference import utils
+ from tpu_inference.layers.common.quantization import u8_unpack_e2m1
+ from tpu_inference.layers.common.sharding import ShardingAxisName
  from tpu_inference.layers.jax.attention.attention import AttentionMetadata
  from tpu_inference.layers.jax.attention.deepseek_v3_attention import MLA
  from tpu_inference.layers.jax.constants import KVCacheType
@@ -23,10 +40,8 @@ from tpu_inference.layers.jax.moe.moe import MoE
  from tpu_inference.layers.jax.transformer_block import (
  SharedExpertsTransformerBlock, TransformerBlock)
  from tpu_inference.logger import init_logger
- from tpu_inference.models.jax.utils.quantization.quantization_utils import \
- get_quant_dtype_from_qwix_config
  from tpu_inference.models.jax.utils.weight_utils import (
- get_param, model_weights_generator, print_param_info, reshape_params)
+ get_param, model_weights_generator, print_param_info)

  logger = init_logger(__name__)

@@ -69,6 +84,9 @@ class DeepSeekV3(nnx.Module):
  hidden_act: str = "silu"
  rms_norm_eps: float = 1e-06
  first_k_dense_replace: int = 3 # replace the first few MOE layers to dense layer.
+ self.use_mla_kernel: bool = self.vllm_config.model_config.use_mla
+
+ logger.info(f"Is using MLA kernel in DeepSeek: {self.use_mla_kernel}")

  num_shared_experts = 1
  rope_theta = 10000
@@ -114,19 +132,30 @@ class DeepSeekV3(nnx.Module):
  qk_rope_head_dim=qk_rope_head_dim,
  v_head_dim=v_head_dim,
  num_local_experts=num_local_experts,
- model_dtype=dtype)
+ model_dtype=dtype,
+ use_mla_kernel=self.use_mla_kernel)

  self.embedder = Embedder(vocab_size=vocab_size,
  hidden_size=hidden_size,
  dtype=dtype,
  rngs=self.rng,
- vd_sharding=(('data', 'expert', 'model'),
+ vd_sharding=(ShardingAxisName.MLP_TENSOR,
  None),
  random_init=self.random_init)

  self.layers = []

  def _create_mla() -> MLA:
+ if self.use_mla_kernel:
+ query_tnh_spec = P(ShardingAxisName.MLP_TENSOR, None, None)
+ keyvalue_skh_spec = P(ShardingAxisName.MLP_TENSOR, None)
+ attn_o_tnh_spec = P(ShardingAxisName.MLP_TENSOR, None, None)
+
+ else:
+ query_tnh_spec = P(None, ShardingAxisName.MLP_TENSOR, None)
+ keyvalue_skh_spec = P(None, ShardingAxisName.MLP_TENSOR, None)
+ attn_o_tnh_spec = P(None, ShardingAxisName.MLP_TENSOR, None)
+
  return MLA(
  rope_theta=rope_theta,
  rope_scaling=rope_scaling,
@@ -137,10 +166,12 @@ class DeepSeekV3(nnx.Module):
  rms_norm_eps=rms_norm_eps,
  v_head_dim=v_head_dim,
  mesh=self.mesh,
+ use_mla_kernel=self.use_mla_kernel,
  random_init=self.random_init,
  hidden_size=hidden_size,
  num_attention_heads=num_attention_heads,
- num_key_value_heads=num_key_value_heads,
+ num_key_value_heads=1
+ if self.use_mla_kernel else num_key_value_heads,
  head_dim=v_head_dim, # MLA uses v_head_dim as head_dim
  dtype=dtype,
  # TODO (jacobplatin): we should refactor this to pass a dtype (or config) directly
@@ -148,14 +179,15 @@ class DeepSeekV3(nnx.Module):
  rngs=self.rng,
  activation_attention_td=(None, None),
  activation_q_td=(None, None),
- query_tnh=P(None, 'model', None),
- keyvalue_skh=P(None, 'model', None),
+ query_tnh=query_tnh_spec,
+ keyvalue_skh=keyvalue_skh_spec,
  activation_attention_out_td=(None, None),
- attn_o_tnh=P(None, 'model', None),
- q_da_sharding=(None, 'model'),
- anh_sharding=(None, 'model', None),
- kv_da_sharding=(None, 'model'),
- nhd_sharding=('model', None, None))
+ attn_o_tnh=attn_o_tnh_spec,
+ q_da_sharding=(None, ShardingAxisName.VOCAB),
+ ap_sharding=(None, ShardingAxisName.MLP_TENSOR),
+ anh_sharding=(None, ShardingAxisName.MLP_TENSOR, None),
+ kv_da_sharding=(None, ShardingAxisName.VOCAB),
+ rd_sharding=(ShardingAxisName.MLP_TENSOR, None))

  for i in range(first_k_dense_replace):
  block = TransformerBlock(
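
The `_create_mla` branches above only differ in which axis of the attention activations is sharded: the MLA-kernel path shards the token axis, while the fallback shards the attention-head axis. A minimal JAX sketch of what those two PartitionSpec choices mean for a (tokens, heads, head_dim) array, assuming a one-axis mesh standing in for ShardingAxisName.MLP_TENSOR (the mesh axis name "model" and the array sizes are illustrative):

```python
import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# One-dimensional device mesh; its single axis plays the role of MLP_TENSOR.
mesh = Mesh(np.array(jax.devices()), axis_names=("model",))

# Dims chosen to divide evenly on power-of-two meshes.
x = jnp.zeros((16, 16, 128))  # (tokens, num_heads, head_dim)

# MLA-kernel path: split the token axis across devices.
tokens_sharded = jax.device_put(x, NamedSharding(mesh, P("model", None, None)))

# Fallback path: split the attention-head axis across devices.
heads_sharded = jax.device_put(x, NamedSharding(mesh, P(None, "model", None)))

print(tokens_sharded.sharding, heads_sharded.sharding)
```
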
@@ -176,14 +208,15 @@ class DeepSeekV3(nnx.Module):
  rngs=self.rng,
  ),
  attn=_create_mla(),
- custom_module=DenseFFW(dtype=dtype,
- hidden_act=hidden_act,
- hidden_size=hidden_size,
- intermediate_size=ffw_intermediate_size,
- rngs=self.rng,
- df_sharding=(None, ('model', 'expert')),
- fd_sharding=(('model', 'expert'), None),
- random_init=self.random_init))
+ custom_module=DenseFFW(
+ dtype=dtype,
+ hidden_act=hidden_act,
+ hidden_size=hidden_size,
+ intermediate_size=ffw_intermediate_size,
+ rngs=self.rng,
+ df_sharding=(None, ShardingAxisName.MLP_TENSOR),
+ fd_sharding=(ShardingAxisName.MLP_TENSOR, None),
+ random_init=self.random_init))

  self.layers.append(block)

@@ -200,9 +233,9 @@ class DeepSeekV3(nnx.Module):
  rngs=self.rng,
  routed_scaling_factor=2.5,
  dtype=dtype,
- activation_ffw_td=('data', None),
- ed_sharding=('model', None),
- e_sharding=('model', ))
+ activation_ffw_td=(ShardingAxisName.MLP_DATA, None),
+ ed_sharding=(ShardingAxisName.MLP_TENSOR, None),
+ e_sharding=(ShardingAxisName.MLP_TENSOR, ))
  if self.sparse_matmul:
  # TODO: orginize the SparseMoE and DenseMoE better given they share most interfaces
  custom_module = SparseMoE(
@@ -216,10 +249,10 @@ class DeepSeekV3(nnx.Module):
  hidden_act=hidden_act,
  rngs=self.rng,
  random_init=self.random_init,
- activation_ffw_td=('data', None),
- activation_ffw_ted=('data', None, None),
- edf_sharding=('model', None, None),
- efd_sharding=('model', None, None),
+ activation_ffw_td=(ShardingAxisName.MLP_TENSOR, None),
+ activation_ffw_ted=(ShardingAxisName.MLP_DATA, None, None),
+ edf_sharding=(ShardingAxisName.MLP_TENSOR, None, None),
+ efd_sharding=(ShardingAxisName.MLP_TENSOR, None, None),
  quantized_dtype=self.weight_loader.quant_dtype
  if self.weight_loader.is_model_quantized else None,
  router=router) if is_moe_layer else DenseFFW(
@@ -229,8 +262,8 @@ class DeepSeekV3(nnx.Module):
  intermediate_size=ffw_intermediate_size,
  rngs=self.rng,
  random_init=self.random_init,
- df_sharding=(None, ('model', 'expert')),
- fd_sharding=(('model', 'expert'), None))
+ df_sharding=(None, ShardingAxisName.MLP_TENSOR),
+ fd_sharding=(ShardingAxisName.MLP_TENSOR, None))
  else:
  custom_module = MoE(
  dtype=dtype,
@@ -241,10 +274,10 @@ class DeepSeekV3(nnx.Module):
  hidden_act=hidden_act,
  rngs=self.rng,
  random_init=self.random_init,
- activation_ffw_td=('data', None),
- activation_ffw_ted=('data', None, None),
- edf_sharding=('model', None, None),
- efd_sharding=('model', None, None),
+ activation_ffw_td=(ShardingAxisName.MLP_DATA, None),
+ activation_ffw_ted=(ShardingAxisName.MLP_DATA, None, None),
+ edf_sharding=(ShardingAxisName.MLP_TENSOR, None, None),
+ efd_sharding=(ShardingAxisName.MLP_TENSOR, None, None),
  router=router) if is_moe_layer else DenseFFW(
  dtype=dtype,
  hidden_act=hidden_act,
@@ -252,18 +285,18 @@ class DeepSeekV3(nnx.Module):
  intermediate_size=ffw_intermediate_size,
  rngs=self.rng,
  random_init=self.random_init,
- df_sharding=(None, ('model', 'expert')),
- fd_sharding=(('model', 'expert'), None))
-
- shared_experts = DenseFFW(dtype=dtype,
- hidden_act=hidden_act,
- hidden_size=hidden_size,
- intermediate_size=num_shared_experts *
- moe_intermediate_size,
- rngs=self.rng,
- random_init=self.random_init,
- df_sharding=(None, ('model', 'expert')),
- fd_sharding=(('model', 'expert'), None))
+ df_sharding=(None, ShardingAxisName.MLP_TENSOR),
+ fd_sharding=(ShardingAxisName.MLP_TENSOR, None))
+
+ shared_experts = DenseFFW(
+ dtype=dtype,
+ hidden_act=hidden_act,
+ hidden_size=hidden_size,
+ intermediate_size=num_shared_experts * moe_intermediate_size,
+ rngs=self.rng,
+ random_init=self.random_init,
+ df_sharding=(None, ShardingAxisName.MLP_TENSOR),
+ fd_sharding=(ShardingAxisName.MLP_TENSOR, None))

  pre_attention_norm = RMSNorm(
  dims=hidden_size,
@@ -304,10 +337,28 @@ class DeepSeekV3(nnx.Module):
  hidden_size=hidden_size,
  dtype=dtype,
  rngs=self.rng,
- vd_sharding=(('data', 'expert', 'model'), None),
- dv_sharding=(None, ('data', 'expert', 'model')),
+ vd_sharding=(ShardingAxisName.MLP_TENSOR, None),
+ dv_sharding=(None, ShardingAxisName.MLP_TENSOR),
  random_init=self.random_init)

+ if os.environ.get("VLLM_LOGGING_LEVEL", "").upper() == "DEBUG":
+ self._print_model_architecture()
+
+ def _print_model_architecture(self):
+ num_display_layers = 5
+
+ logger.debug("### Embedding ###")
+ nnx.display(self.embedder)
+
+ logger.debug(f"\n### First {num_display_layers} Layers ###")
+ # Loop through the slice and display each layer
+ for i, layer in enumerate(self.layers[:num_display_layers]):
+ logger.debug(f"\n--- Layer {i} ---")
+ nnx.display(layer)
+
+ logger.debug("\n### LM Head ###")
+ nnx.display(self.lm_head)
+
  # For compatibility with flax.
  def apply(self, variables, *args, **kwargs):
  return self.__call__(*args, **kwargs)
@@ -352,10 +403,19 @@ class DeepSeekV3(nnx.Module):
  @dataclass
  class DeepSeekV3WeightLoader:

- def __init__(self, vllm_config: VllmConfig, num_layers, hidden_size,
- q_lora_rank, kv_lora_rank, attn_heads, qk_nope_head_dim,
- qk_rope_head_dim, v_head_dim, num_local_experts, model_dtype):
-
+ def __init__(self,
+ vllm_config: VllmConfig,
+ num_layers,
+ hidden_size,
+ q_lora_rank,
+ kv_lora_rank,
+ attn_heads,
+ qk_nope_head_dim,
+ qk_rope_head_dim,
+ v_head_dim,
+ num_local_experts,
+ model_dtype,
+ use_mla_kernel=False):
  self.num_layers = num_layers
  self.names_and_weights_generator = model_weights_generator(
  model_name_or_path=vllm_config.model_config.model,
@@ -364,7 +424,12 @@ class DeepSeekV3WeightLoader:
  self.is_verbose = vllm_config.additional_config.get(
  "is_verbose", None) is not None
  self.num_routed_experts = num_local_experts
+ self.attn_heads = attn_heads
+ self.qk_nope_head_dim = qk_nope_head_dim
+ self.v_head_dim = v_head_dim
+ self.kv_lora_rank = kv_lora_rank
  self.model_dtype = model_dtype
+ self.use_mla_kernel = use_mla_kernel

  self._transpose_map = {
  # dense mlp
@@ -373,10 +438,12 @@ class DeepSeekV3WeightLoader:
  r"mlp\.up_proj": (1, 0),
  # mla
  r"q_a_proj": (1, 0),
- r"q_b_proj": (2, 0, 1),
+ r"q_b_proj": (1, 0),
  r"kv_a_proj_with_mqa": (1, 0),
- r"kv_b_proj": (2, 0, 1),
- r"o_proj": (1, 2, 0),
+ r"kv_b_proj": (1, 0),
+ r"k_b_proj": (2, 0, 1), # used for MLA kernel
+ r"v_b_proj": (2, 0, 1), # used for MLA kernel
+ r"o_proj": (1, 0),
  # moe
  r"mlp\.gate\.weight": (1, 0),
  r"mlp\.experts\.\d+\.gate_proj": (0, 2, 1),
@@ -388,13 +455,6 @@ class DeepSeekV3WeightLoader:
  # lm_head
  r"lm_head\.weight": (1, 0)
  }
- self._weight_shape_map = {
- "q_b_proj":
- (attn_heads, qk_nope_head_dim + qk_rope_head_dim, q_lora_rank),
- "kv_b_proj":
- (attn_heads, qk_nope_head_dim + v_head_dim, kv_lora_rank),
- "o_proj": (hidden_size, attn_heads, v_head_dim)
- }

  # Set the mappings from loaded parameter keys to standardized names.
  self._loaded_to_standardized_keys = {
@@ -419,13 +479,13 @@ class DeepSeekV3WeightLoader:
  "model.layers.*.self_attn.q_a_proj.weight":
  "layers.*.attn.kernel_q_down_proj_DA",
  "model.layers.*.self_attn.q_b_proj.weight":
- "layers.*.attn.kernel_q_up_proj_ANH",
+ "layers.*.attn.kernel_q_up_proj_AP",
  "model.layers.*.self_attn.kv_a_proj_with_mqa.weight":
  "layers.*.attn.kernel_kv_down_proj_DA",
  "model.layers.*.self_attn.kv_b_proj.weight":
- "layers.*.attn.kernel_kv_up_proj_ANH",
+ "layers.*.attn.kernel_kv_up_proj_AL",
  "model.layers.*.self_attn.o_proj.weight":
- "layers.*.attn.kernel_o_proj_NHD",
+ "layers.*.attn.kernel_o_proj_RD",
  # Dense ffw
  "model.layers.*.mlp.gate_proj.weight":
  "layers.*.custom_module.kernel_gating_DF",
@@ -452,57 +512,50 @@ class DeepSeekV3WeightLoader:
  "model.layers.*.mlp.shared_experts.up_proj.weight":
  "layers.*.shared_experts.kernel_up_proj_DF",
  }
-
- # TODO (jacobplatin): we shouldn't hard-code this, but the logic to obtain the true quantized dtype
- # is non-trivial and the default checkpoints all use this dtype
- self.quant_dtype = jnp.float8_e4m3fn
+ if self.use_mla_kernel:
+ self._loaded_to_standardized_keys.update({
+ "model.layers.*.self_attn.k_b_proj.weight":
+ "layers.*.attn.kernel_k_up_proj_ANH",
+ "model.layers.*.self_attn.v_b_proj.weight":
+ "layers.*.attn.kernel_v_up_proj_ANH",
+ })
+ # TODO (jacobplatin): we should not be hard-coding these
+ self.scale_dtype, self.quant_dtype = jnp.bfloat16, jnp.float8_e4m3fn

  self.is_model_quantized = not vllm_config.additional_config.get(
  "skip_quantization", False)
- if self.is_model_quantized:
- # TODO (jacobplatin): expand support eventually
- quantization_type = vllm_config.model_config.hf_config.quantization_config[
- "quant_method"]
- assert quantization_type == "fp8", "DeepSeek only supports the fp8 quantization method for now"
- self.scale_dtype, self.quant_dtype = get_quant_dtype_from_qwix_config(
- vllm_config)
-
- logger.info(
- f"Quantizing DeepSeek with quantization dtype: {self.quant_dtype} and scale dtype: {self.scale_dtype}"
- )

- quantization_block_sizes = vllm_config.model_config.hf_config.quantization_config[
- "weight_block_size"]
- assert len(
- quantization_block_sizes
- ) == 2, f"Expected only 2 quantization block sizes but got {quantization_block_sizes}"
- self.quantization_block_size_n = quantization_block_sizes[0]
- self.quantization_block_size_k = quantization_block_sizes[1]
- # TODO (jacobplatin): remove this check in the future
- assert self.quantization_block_size_n == self.quantization_block_size_k, "Quantization block size n and k must be the same!"
- # NOTE: this is only needed for pre-quantized models
- self._scale_shape_map = {
- "q_b_proj": (1, qk_nope_head_dim + qk_rope_head_dim,
- q_lora_rank // self.quantization_block_size_n),
- "kv_b_proj": (attn_heads, (qk_nope_head_dim + v_head_dim) //
- self.quantization_block_size_n,
- kv_lora_rank // self.quantization_block_size_n),
- "o_proj":
- (hidden_size // self.quantization_block_size_n, attn_heads,
- v_head_dim // self.quantization_block_size_n),
- }
+ if self.is_model_quantized:
  # NOTE: this is only needed for pre-quantized models when doing random weight loading
+ # because the scales that Qwix configures by default don't necessarily match the
+ # scales in practice
  # TODO (jacobplatin): remove or clean this up
- self.scale_shap_map_for_random_weight_loading = {
- "kernel_kv_down_proj_DA": (56, 576),
- "kernel_kv_up_proj_ANH": (4, 128, 2),
- "kernel_q_up_proj_ANH": (12, 1, 192),
- "kernel_o_proj_NHD": (128, 1, 56),
- "kernel_down_proj_EFD": (256, 16, 56),
- "kernel_up_proj_EDF": (256, 56, 16),
- "kernel_gating_EDF": (256, 56, 16),
+ self.scale_shape_map_for_random_weight_loading = {
+ # MoE experts (3D)
+ "custom_module.kernel_down_proj_EFD": (256, 8, 7168),
+ "custom_module.kernel_gating_EDF": (256, 28, 2048),
+ "custom_module.kernel_up_proj_EDF": (256, 28, 2048),
+ # Shared experts (2D)
+ "shared_experts.kernel_down_proj_FD": (8, 7168),
+ "shared_experts.kernel_gating_DF": (28, 2048),
+ "shared_experts.kernel_up_proj_DF": (28, 2048),
+ # Dense FFW (2D)
+ "custom_module.kernel_gating_DF": (28, 18432),
+ "custom_module.kernel_up_proj_DF": (28, 18432),
+ "custom_module.kernel_down_proj_FD": (72, 7168),
+ # Attention (3D for MLA, 2D for the rest)
+ "attn.kernel_q_down_proj_DA": (28, 1536),
+ "attn.kernel_q_up_proj_AP": (6, 24576),
+ "attn.kernel_kv_down_proj_DA": (28, 576),
+ "attn.kernel_kv_up_proj_AL": (2, 32768),
+ "attn.kernel_o_proj_RD": (64, 7168),
+ "attn.kernel_k_up_proj_ANH": (2, 128, 128), # MLA
+ "attn.kernel_v_up_proj_ANH": (2, 128, 128), # MLA
  }

+ # TODO (jacobplatin): remove this check eventually!
+ assert self.quant_dtype == jnp.float8_e4m3fn, f"Expected quant_dtype to be float8_e4m3fn for DeepSeek but got {self.quant_dtype}"
+
  def map_loaded_to_standardized_name(self, loaded_key: str) -> str:
  # Find the corresponding model key using the HF key
  if "layer" in loaded_key:
@@ -580,45 +633,56 @@ class DeepSeekV3WeightLoader:
  base_model_weight, "array") else base_model_weight.sharding

  # Convert weights from torch into numpy
- cast_type = model_weight.value.dtype
-
- torch_view_type = DTYPE_VIEW_MAP.get(jnp.dtype(cast_type))
-
- if torch_view_type:
- # Avoid unnecessary upcasting and mem copy by viewing the tensor's
- # raw data as integers before converting to a JAX array.
- weight_np = jnp.array(
- weight.view(torch_view_type).numpy()).view(cast_type)
+ if weight.dtype == torch.uint8 and scale is not None:
+ # Assume packed FP4 format when uint8 weights with scale provided
+ weight_jax_u8 = jnp.array(weight.cpu().numpy())
+ weight_np = u8_unpack_e2m1(weight_jax_u8)
+ scale = scale.to(torch.float32).numpy().astype(self.scale_dtype)
  else:
- raise ValueError(
- f"Unsupported dtype for tensor conversion: {cast_type}")
+ cast_type = model_weight.value.dtype
+ # Special-case: FP4 values stored as FP8 for compatibility.
+ # If the model expects float4_e2m1fn but the checkpoint provides FP8,
+ # convert by numeric value (float32) then cast to float4.
+ if cast_type == jnp.float4_e2m1fn and weight.dtype == torch.float8_e4m3fn:
+ weight_np = jnp.array(weight.float().numpy()).astype(cast_type)
+ else:
+ torch_view_type = DTYPE_VIEW_MAP.get(jnp.dtype(cast_type))

- if scale is not None:
- scale = scale.to(torch.float32).numpy().astype(self.scale_dtype)
+ if torch_view_type:
+ # Avoid unnecessary upcasting and mem copy by viewing the tensor's
+ # raw data as integers before converting to a JAX array.
+ weight_np = jnp.array(
+ weight.view(torch_view_type).numpy()).view(cast_type)
+ else:
+ raise ValueError(
+ f"Unsupported dtype for tensor conversion: {cast_type}"
+ )

- # Reshape and transpose weights if necessary.
- weight_np = reshape_params(name, weight_np, self._weight_shape_map)
- if scale is not None:
- scale = reshape_params(name, scale, self._scale_shape_map)
+ if scale is not None:
+ scale = scale.to(torch.float32).numpy().astype(
+ self.scale_dtype)
  weight_np = self._transpose_params(name, weight_np)
  if scale is not None:
  scale = self._transpose_params(name, scale)
+ # Ensure scale is broadcastable to weight_np by repeating per-axis.
  weight_shape = weight_np.shape
  scale_shape = scale.shape
- assert len(weight_shape) == len(scale_shape)
- for idx, (weight_dim,
- scale_dim) in enumerate(zip(weight_shape, scale_shape)):
- if weight_dim // self.quantization_block_size_n != scale_dim and weight_dim // scale_dim != 1:
- old_scale_shape = scale.shape
- scale = scale.repeat(self.quantization_block_size_n,
- axis=idx)[:, :weight_dim]
+ if len(weight_shape) == len(scale_shape):
+ new_scale = scale
+ for wdim, sdim in zip(weight_shape, scale_shape):
+ if (wdim % sdim != 0):
+ raise ValueError(
+ f"Weight dim {wdim} is not divisible by scale dim {sdim} for weight {name} with shape {weight_shape} and scale {scale_shape}!"
+ )
+ if scale_shape != new_scale.shape:
  logger.warning(
- f"Got a weight with shape {weight_shape} and scale with shape {old_scale_shape} "
- f"where the scale_dim {scale_dim} does not match the weight_dim {weight_dim} "
- f"multiplied by the quantization block size {self.quantization_block_size_n}. "
- f"Repeating the scale to new shape {scale.shape} along axis {idx} with repeat size {self.quantization_block_size_n}."
+ f"Adjusted scale shape {scale_shape} to {new_scale.shape} to match weight {weight_shape}"
  )
- break
+ scale = new_scale
+ else:
+ raise ValueError(
+ f"Scale rank {scale_shape} does not match weight rank {weight_shape}"
+ )

  if model_weight.value.shape != weight_np.shape:
  raise ValueError(
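
The reworked scale handling above now only validates rank equality and per-axis divisibility between the weight and its block-wise scale, rather than repeating the scale in place. A minimal NumPy sketch of the block-wise expansion such scales imply (shapes and block sizes are illustrative, not taken from the checkpoint):

```python
import numpy as np

def broadcast_blockwise_scale(weight: np.ndarray, scale: np.ndarray) -> np.ndarray:
    """Expand a block-wise scale so it can multiply `weight` elementwise.

    Assumes every weight dim is an integer multiple of the matching scale dim,
    which is exactly what the loader's divisibility check enforces.
    """
    assert weight.ndim == scale.ndim
    for axis, (wdim, sdim) in enumerate(zip(weight.shape, scale.shape)):
        assert wdim % sdim == 0, f"dim {axis}: {wdim} not divisible by {sdim}"
        scale = np.repeat(scale, wdim // sdim, axis=axis)
    return scale

# Example: a (256, 512) weight with one scale per (128, 128) block.
w = np.ones((256, 512), dtype=np.float32)
s = np.full((2, 4), 0.5, dtype=np.float32)
dequantized = w * broadcast_blockwise_scale(w, s)
```
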
@@ -652,10 +716,8 @@ class DeepSeekV3WeightLoader:
  logger.warning(
  f"Could not create sharded scale for {name} with shape {scale.shape} and sharding {sharding}, skipping sharding..."
  )
- # NOTE: Despite the fact that scale has the name `scale_inv` in it, we don't need to
- # inverse it
- assert base_model_weight.array.scale.value.dtype == maybe_sharded_scale.dtype, "Expected dtype for model weight scale with name {mapped_name} and dtype ({base_model_weight.array.scale.value.dtype}) to match that of the incoming weight scale ({maybe_sharded_scale.dtype})"
- assert base_model_weight.array.qvalue.value.dtype == sharded_array.dtype, "Expected dtype for model weight with name {mapped_name} and dtype ({base_model_weight.array.qvalue.value.dtype}) to match that of the incoming weight ({sharded_array.dtype})"
+ assert base_model_weight.array.scale.value.dtype == maybe_sharded_scale.dtype, f"Expected dtype for model weight scale with name {mapped_name} and dtype ({base_model_weight.array.scale.value.dtype}) to match that of the incoming weight scale ({maybe_sharded_scale.dtype})"
+ assert base_model_weight.array.qvalue.value.dtype == sharded_array.dtype, f"Expected dtype for model weight with name {mapped_name} and dtype ({base_model_weight.array.qvalue.value.dtype}) to match that of the incoming weight ({sharded_array.dtype})"
  base_model_weight.array.scale.value = maybe_sharded_scale
  base_model_weight.array.qvalue.value = sharded_array
  else:
@@ -721,7 +783,11 @@ class DeepSeekV3WeightLoader:
  # TODO (jacobplatin): refactor this so that we instead change / update `model_weights_generator`
  # instead of checking "weight_scale_inv" and assuming quantization method is fp8
  scale = None
- if loaded_weight.dtype == j2t_dtype(self.quant_dtype.dtype):
+ # Mixed quantization: accept both fp8 and packed fp4 (uint8) tensors
+ allowed_quant_dtypes = {
+ j2t_dtype(self.quant_dtype.dtype), torch.uint8
+ }
+ if loaded_weight.dtype in allowed_quant_dtypes:
  if self.is_model_quantized:
  scale_name = loaded_name.replace(
  ".weight", ".weight_scale_inv")
@@ -802,21 +868,65 @@ class DeepSeekV3WeightLoader:
  f"Cumulative local memory: {cumulative_local_memory} GB"
  )
  else:
- weight_bytes, weight_shards = self._load_individual_weight(
- loaded_name,
- loaded_weight,
- model_params,
- model_for_loading.mesh,
- scale=scale)
- if self.is_verbose:
- cumulative_global_memory += weight_bytes
- cumulative_local_memory += weight_shards
- logger.info(
- f"Cumulative global memory: {cumulative_global_memory} GB"
- )
- logger.info(
- f"Cumulative local memory: {cumulative_local_memory} GB"
- )
+ if self.use_mla_kernel and "kv_b_proj" in loaded_name:
+ # loaded_weight shape: (num_heads * (d_k + d_v), kv_lora_rank)
+ # scale shape: (num_heads * (d_k + d_v) / block_n, kv_lora_rank / block_k)
+ # Reshape to (num_heads, (d_k + d_v), kv_lora_rank) and split
+ weight_reshaped = loaded_weight.view(
+ self.attn_heads,
+ self.qk_nope_head_dim + self.v_head_dim,
+ self.kv_lora_rank)
+ k_weight = weight_reshaped[:, :self.
+ qk_nope_head_dim, :]
+ v_weight = weight_reshaped[:,
+ self.qk_nope_head_dim:, :]
+
+ loaded_weights_list = [k_weight, v_weight]
+ loaded_names = [
+ loaded_name.replace("kv_b_proj", "k_b_proj"),
+ loaded_name.replace("kv_b_proj", "v_b_proj")
+ ]
+
+ scales_list = [None, None]
+ if scale is not None:
+ assert loaded_weight.shape[0] == scale.shape[0]
+ block_size_k = loaded_weight.shape[
+ 1] // scale.shape[1]
+ assert block_size_k > 0, f"Expected non-zero block size but got {block_size_k}!"
+ scale_reshaped = scale.view(
+ self.attn_heads,
+ (self.qk_nope_head_dim + self.v_head_dim),
+ self.kv_lora_rank // block_size_k)
+
+ k_scale = scale_reshaped[:, :self.
+ qk_nope_head_dim, :]
+ v_scale = scale_reshaped[:,
+ self.qk_nope_head_dim:, :]
+ scales_list = [k_scale, v_scale]
+
+ else:
+ loaded_weights_list = [loaded_weight]
+ loaded_names = [loaded_name]
+ scales_list = [scale]
+
+ for loaded_name, loaded_weight, scale in zip(
+ loaded_names, loaded_weights_list, scales_list):
+
+ weight_bytes, weight_shards = self._load_individual_weight(
+ loaded_name,
+ loaded_weight,
+ model_params,
+ model_for_loading.mesh,
+ scale=scale)
+ if self.is_verbose:
+ cumulative_global_memory += weight_bytes
+ cumulative_local_memory += weight_shards
+ logger.info(
+ f"Cumulative global memory: {cumulative_global_memory} GB"
+ )
+ logger.info(
+ f"Cumulative local memory: {cumulative_local_memory} GB"
+ )

  del mlp_experts_gate_proj_weights
  del mlp_experts_up_proj_weights
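
The final hunk splits the fused kv_b_proj checkpoint tensor into separate k_b_proj and v_b_proj tensors before handing each to _load_individual_weight. The same reshape-and-slice shown standalone, using illustrative DeepSeek-V3-like dimensions (the concrete sizes here are assumptions for the example, not read from the config):

```python
import torch

num_heads, qk_nope_head_dim, v_head_dim, kv_lora_rank = 128, 128, 128, 512

# Fused checkpoint tensor: (num_heads * (d_k + d_v), kv_lora_rank)
kv_b_proj = torch.randn(num_heads * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

# Reshape to (num_heads, d_k + d_v, kv_lora_rank), then slice along dim 1.
kv = kv_b_proj.view(num_heads, qk_nope_head_dim + v_head_dim, kv_lora_rank)
k_b_proj = kv[:, :qk_nope_head_dim, :]  # (num_heads, d_k, kv_lora_rank)
v_b_proj = kv[:, qk_nope_head_dim:, :]  # (num_heads, d_v, kv_lora_rank)

assert k_b_proj.shape == (num_heads, qk_nope_head_dim, kv_lora_rank)
assert v_b_proj.shape == (num_heads, v_head_dim, kv_lora_rank)
```

Any block-wise weight scale is split the same way after being reshaped with the derived block_size_k, so each half keeps the scale rows that correspond to its slice of the fused matrix.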