tpu_inference-0.11.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic.

Files changed (168)
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_adapters.py +83 -0
  4. tests/core/test_core_tpu.py +523 -0
  5. tests/core/test_disagg_executor.py +60 -0
  6. tests/core/test_disagg_utils.py +53 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  10. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  11. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  12. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  13. tests/lora/__init__.py +0 -0
  14. tests/lora/test_lora.py +123 -0
  15. tests/test_base.py +201 -0
  16. tests/test_quantization.py +836 -0
  17. tests/test_tpu_info.py +120 -0
  18. tests/test_utils.py +218 -0
  19. tests/tpu_backend_test.py +59 -0
  20. tpu_inference/__init__.py +30 -0
  21. tpu_inference/adapters/__init__.py +0 -0
  22. tpu_inference/adapters/vllm_adapters.py +42 -0
  23. tpu_inference/adapters/vllm_config_adapters.py +134 -0
  24. tpu_inference/backend.py +69 -0
  25. tpu_inference/core/__init__.py +0 -0
  26. tpu_inference/core/adapters.py +153 -0
  27. tpu_inference/core/core_tpu.py +776 -0
  28. tpu_inference/core/disagg_executor.py +117 -0
  29. tpu_inference/core/disagg_utils.py +51 -0
  30. tpu_inference/di/__init__.py +0 -0
  31. tpu_inference/di/abstracts.py +28 -0
  32. tpu_inference/di/host.py +76 -0
  33. tpu_inference/di/interfaces.py +51 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/tpu_connector.py +699 -0
  36. tpu_inference/distributed/utils.py +59 -0
  37. tpu_inference/executors/__init__.py +0 -0
  38. tpu_inference/executors/ray_distributed_executor.py +346 -0
  39. tpu_inference/experimental/__init__.py +0 -0
  40. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  41. tpu_inference/interfaces/__init__.py +0 -0
  42. tpu_inference/interfaces/cache.py +31 -0
  43. tpu_inference/interfaces/config.py +47 -0
  44. tpu_inference/interfaces/config_parts.py +117 -0
  45. tpu_inference/interfaces/engine.py +51 -0
  46. tpu_inference/interfaces/outputs.py +22 -0
  47. tpu_inference/interfaces/params.py +21 -0
  48. tpu_inference/interfaces/platform.py +74 -0
  49. tpu_inference/interfaces/request.py +39 -0
  50. tpu_inference/interfaces/scheduler.py +31 -0
  51. tpu_inference/kernels/__init__.py +0 -0
  52. tpu_inference/kernels/collectives/__init__.py +0 -0
  53. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  54. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  55. tpu_inference/kernels/collectives/util.py +47 -0
  56. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  57. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  58. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  59. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  60. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  61. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  62. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  66. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
  71. tpu_inference/layers/__init__.py +0 -0
  72. tpu_inference/layers/common/__init__.py +0 -0
  73. tpu_inference/layers/common/attention_metadata.py +34 -0
  74. tpu_inference/layers/jax/__init__.py +0 -0
  75. tpu_inference/layers/jax/attention/__init__.py +0 -0
  76. tpu_inference/layers/jax/attention/attention.py +254 -0
  77. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  78. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  79. tpu_inference/layers/jax/attention_interface.py +356 -0
  80. tpu_inference/layers/jax/base.py +151 -0
  81. tpu_inference/layers/jax/binary_search.py +295 -0
  82. tpu_inference/layers/jax/constants.py +88 -0
  83. tpu_inference/layers/jax/layers.py +301 -0
  84. tpu_inference/layers/jax/misc.py +16 -0
  85. tpu_inference/layers/jax/moe/__init__.py +0 -0
  86. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  87. tpu_inference/layers/jax/moe/moe.py +209 -0
  88. tpu_inference/layers/jax/rope.py +172 -0
  89. tpu_inference/layers/jax/rope_interface.py +214 -0
  90. tpu_inference/layers/jax/sample/__init__.py +0 -0
  91. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  92. tpu_inference/layers/jax/sample/sampling.py +95 -0
  93. tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
  94. tpu_inference/layers/jax/sharding.py +406 -0
  95. tpu_inference/layers/jax/transformer_block.py +76 -0
  96. tpu_inference/layers/vllm/__init__.py +0 -0
  97. tpu_inference/layers/vllm/attention.py +184 -0
  98. tpu_inference/layers/vllm/fused_moe.py +399 -0
  99. tpu_inference/layers/vllm/linear_common.py +186 -0
  100. tpu_inference/layers/vllm/quantization/__init__.py +34 -0
  101. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  102. tpu_inference/layers/vllm/quantization/common.py +105 -0
  103. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  104. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
  105. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  106. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  108. tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
  109. tpu_inference/layers/vllm/sharding.py +151 -0
  110. tpu_inference/logger.py +10 -0
  111. tpu_inference/lora/__init__.py +0 -0
  112. tpu_inference/lora/torch_lora_ops.py +103 -0
  113. tpu_inference/lora/torch_punica_tpu.py +308 -0
  114. tpu_inference/mock/__init__.py +0 -0
  115. tpu_inference/mock/vllm_config_utils.py +28 -0
  116. tpu_inference/mock/vllm_envs.py +1233 -0
  117. tpu_inference/mock/vllm_logger.py +212 -0
  118. tpu_inference/mock/vllm_logging_utils.py +15 -0
  119. tpu_inference/models/__init__.py +0 -0
  120. tpu_inference/models/common/__init__.py +0 -0
  121. tpu_inference/models/common/model_loader.py +433 -0
  122. tpu_inference/models/jax/__init__.py +0 -0
  123. tpu_inference/models/jax/deepseek_v3.py +868 -0
  124. tpu_inference/models/jax/llama3.py +366 -0
  125. tpu_inference/models/jax/llama4.py +473 -0
  126. tpu_inference/models/jax/llama_eagle3.py +333 -0
  127. tpu_inference/models/jax/phi3.py +376 -0
  128. tpu_inference/models/jax/qwen2.py +375 -0
  129. tpu_inference/models/jax/qwen2_5_vl.py +976 -0
  130. tpu_inference/models/jax/qwen3.py +302 -0
  131. tpu_inference/models/jax/utils/__init__.py +0 -0
  132. tpu_inference/models/jax/utils/file_utils.py +96 -0
  133. tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
  134. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  135. tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
  136. tpu_inference/models/jax/utils/weight_utils.py +510 -0
  137. tpu_inference/models/vllm/__init__.py +0 -0
  138. tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
  139. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  140. tpu_inference/platforms/__init__.py +2 -0
  141. tpu_inference/platforms/tpu_jax.py +257 -0
  142. tpu_inference/runner/__init__.py +0 -0
  143. tpu_inference/runner/block_table_jax.py +122 -0
  144. tpu_inference/runner/compilation_manager.py +672 -0
  145. tpu_inference/runner/input_batch_jax.py +435 -0
  146. tpu_inference/runner/kv_cache.py +119 -0
  147. tpu_inference/runner/kv_cache_manager.py +460 -0
  148. tpu_inference/runner/lora_utils.py +92 -0
  149. tpu_inference/runner/multimodal_manager.py +208 -0
  150. tpu_inference/runner/persistent_batch_manager.py +244 -0
  151. tpu_inference/runner/speculative_decoding_manager.py +250 -0
  152. tpu_inference/runner/structured_decoding_manager.py +89 -0
  153. tpu_inference/runner/tpu_jax_runner.py +771 -0
  154. tpu_inference/runner/utils.py +426 -0
  155. tpu_inference/spec_decode/__init__.py +0 -0
  156. tpu_inference/spec_decode/jax/__init__.py +0 -0
  157. tpu_inference/spec_decode/jax/eagle3.py +334 -0
  158. tpu_inference/tpu_info.py +77 -0
  159. tpu_inference/utils.py +294 -0
  160. tpu_inference/worker/__init__.py +0 -0
  161. tpu_inference/worker/_temporary_vllm_compat.py +129 -0
  162. tpu_inference/worker/base.py +100 -0
  163. tpu_inference/worker/tpu_worker_jax.py +321 -0
  164. tpu_inference-0.11.1.dist-info/METADATA +101 -0
  165. tpu_inference-0.11.1.dist-info/RECORD +168 -0
  166. tpu_inference-0.11.1.dist-info/WHEEL +5 -0
  167. tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
  168. tpu_inference-0.11.1.dist-info/top_level.txt +2 -0
tpu_inference/models/jax/deepseek_v3.py
@@ -0,0 +1,868 @@
+ import re
+ from dataclasses import dataclass
+ from typing import List, Optional, Tuple
+
+ import jax
+ import jax.numpy as jnp
+ import torch
+ from flax import nnx
+ from flax.typing import PRNGKey
+ from jax.sharding import Mesh, NamedSharding
+ from jax.sharding import PartitionSpec as P
+ from torchax.ops.mappings import j2t_dtype
+ from vllm.config import VllmConfig
+
+ from tpu_inference import utils
+ from tpu_inference.layers.jax.attention.attention import AttentionMetadata
+ from tpu_inference.layers.jax.attention.deepseek_v3_attention import MLA
+ from tpu_inference.layers.jax.constants import KVCacheType
+ from tpu_inference.layers.jax.layers import DenseFFW, Embedder, LMhead, RMSNorm
+ from tpu_inference.layers.jax.moe.deepseek_v3_moe import (DeepSeekV3Router,
+                                                           SparseMoE)
+ from tpu_inference.layers.jax.moe.moe import MoE
+ from tpu_inference.layers.jax.transformer_block import (
+     SharedExpertsTransformerBlock, TransformerBlock)
+ from tpu_inference.logger import init_logger
+ from tpu_inference.models.jax.utils.quantization.quantization_utils import \
+     get_quant_dtype_from_qwix_config
+ from tpu_inference.models.jax.utils.weight_utils import (
+     get_param, model_weights_generator, print_param_info, reshape_params)
+
+ logger = init_logger(__name__)
+
+ # A map from JAX dtype to the corresponding PyTorch integer dtype for raw memory viewing.
+ DTYPE_VIEW_MAP = {
+     jnp.dtype(jnp.float8_e4m3fn): torch.uint8,
+     jnp.dtype(jnp.bfloat16): torch.uint16,
+     jnp.dtype(jnp.float32): torch.uint32,
+ }
+
+
+ @dataclass
+ class DeepSeekV3(nnx.Module):
+
+     def __init__(self,
+                  vllm_config: VllmConfig,
+                  rng: jax.Array,
+                  mesh: Mesh,
+                  force_random_weights: bool = False):
+         assert mesh is not None
+
+         self.vllm_config = vllm_config
+         self.rng = nnx.Rngs(rng)
+
+         # NOTE: the default is 61
+         num_layers: int = vllm_config.model_config.hf_config.num_hidden_layers
+         num_local_experts: int = 256
+
+         vocab_size: int = 129280
+         hidden_size: int = 7168
+         # NOTE: this dtype may be implicitly overridden if using Qwix to load the quantized weights
+         dtype: jnp.dtype = jnp.bfloat16
+         num_attention_heads: int = 128
+         num_key_value_heads: int = 128
+         ffw_intermediate_size: int = 18432
+         moe_intermediate_size: int = 2048
+         num_experts_per_token: int = 8
+         n_group: int = 8
+         interleave_moe_layer_step: int = 1  # Deepseek V3 has moe_layer_freq=1 in hf config.
+         hidden_act: str = "silu"
+         rms_norm_eps: float = 1e-06
+         first_k_dense_replace: int = 3  # replace the first few MoE layers with dense layers.
+
+         num_shared_experts = 1
+         rope_theta = 10000
+         rope_scaling = {
+             "beta_fast": 32,
+             "beta_slow": 1,
+             "factor": 40,
+             "mscale": 1.0,
+             "mscale_all_dim": 1.0,
+             "original_max_position_embeddings": 4096,
+             "type": "yarn"
+         }
+         q_lora_rank = 1536
+         kv_lora_rank = 512
+         qk_nope_head_dim = 128
+         qk_rope_head_dim = 64
+         v_head_dim = 128
+
+         self.random_init = force_random_weights or self.vllm_config.additional_config.get(
+             "random_weights", False)
+         self.sparse_matmul = self.vllm_config.additional_config.get(
+             "sparse_matmul", False)
+
+         if isinstance(self.sparse_matmul, str):
+             self.sparse_matmul = self.sparse_matmul.lower() == "true"
+         else:
+             self.sparse_matmul = bool(self.sparse_matmul)
+
+         if self.sparse_matmul:
+             logger.info("sparse matmul is enabled")
+         else:
+             logger.info("sparse matmul is disabled, using dense matmul")
+         self.mesh = mesh
+
+         self.weight_loader = DeepSeekV3WeightLoader(
+             vllm_config=vllm_config,
+             num_layers=num_layers,
+             hidden_size=hidden_size,
+             q_lora_rank=q_lora_rank,
+             kv_lora_rank=kv_lora_rank,
+             attn_heads=num_attention_heads,
+             qk_nope_head_dim=qk_nope_head_dim,
+             qk_rope_head_dim=qk_rope_head_dim,
+             v_head_dim=v_head_dim,
+             num_local_experts=num_local_experts,
+             model_dtype=dtype)
+
+         self.embedder = Embedder(vocab_size=vocab_size,
+                                  hidden_size=hidden_size,
+                                  dtype=dtype,
+                                  rngs=self.rng,
+                                  vd_sharding=(('data', 'expert', 'model'),
+                                               None),
+                                  random_init=self.random_init)
+
+         self.layers = []
+
+         def _create_mla() -> MLA:
+             return MLA(
+                 rope_theta=rope_theta,
+                 rope_scaling=rope_scaling,
+                 q_lora_rank=q_lora_rank,
+                 kv_lora_rank=kv_lora_rank,
+                 qk_nope_head_dim=qk_nope_head_dim,
+                 qk_rope_head_dim=qk_rope_head_dim,
+                 rms_norm_eps=rms_norm_eps,
+                 v_head_dim=v_head_dim,
+                 mesh=self.mesh,
+                 random_init=self.random_init,
+                 hidden_size=hidden_size,
+                 num_attention_heads=num_attention_heads,
+                 num_key_value_heads=num_key_value_heads,
+                 head_dim=v_head_dim,  # MLA uses v_head_dim as head_dim
+                 dtype=dtype,
+                 # TODO (jacobplatin): we should refactor this to pass a dtype (or config) directly
+                 kv_cache_dtype=vllm_config.cache_config.cache_dtype,
+                 rngs=self.rng,
+                 activation_attention_td=(None, None),
+                 activation_q_td=(None, None),
+                 query_tnh=P(None, 'model', None),
+                 keyvalue_skh=P(None, 'model', None),
+                 activation_attention_out_td=(None, None),
+                 attn_o_tnh=P(None, 'model', None),
+                 q_da_sharding=(None, 'model'),
+                 anh_sharding=(None, 'model', None),
+                 kv_da_sharding=(None, 'model'),
+                 nhd_sharding=('model', None, None))
+
+         for i in range(first_k_dense_replace):
+             block = TransformerBlock(
+                 pre_attention_norm=RMSNorm(
+                     dims=hidden_size,
+                     random_init=self.random_init,
+                     epsilon=rms_norm_eps,
+                     with_scale=True,
+                     dtype=dtype,
+                     rngs=self.rng,
+                 ),
+                 pre_mlp_norm=RMSNorm(
+                     dims=hidden_size,
+                     random_init=self.random_init,
+                     epsilon=rms_norm_eps,
+                     with_scale=True,
+                     dtype=dtype,
+                     rngs=self.rng,
+                 ),
+                 attn=_create_mla(),
+                 custom_module=DenseFFW(dtype=dtype,
+                                        hidden_act=hidden_act,
+                                        hidden_size=hidden_size,
+                                        intermediate_size=ffw_intermediate_size,
+                                        rngs=self.rng,
+                                        df_sharding=(None, ('model', 'expert')),
+                                        fd_sharding=(('model', 'expert'), None),
+                                        random_init=self.random_init))
+
+             self.layers.append(block)
+
+         for i in range(first_k_dense_replace, num_layers):
+             is_moe_layer = ((i + 1) % interleave_moe_layer_step == 0)
+             router = DeepSeekV3Router(
+                 random_init=self.random_init,
+                 hidden_size=hidden_size,
+                 num_experts=num_local_experts,
+                 num_experts_per_tok=num_experts_per_token,
+                 n_groups=n_group,
+                 topk_groups=4,
+                 norm_topk_prob=True,
+                 rngs=self.rng,
+                 routed_scaling_factor=2.5,
+                 dtype=dtype,
+                 activation_ffw_td=('data', None),
+                 ed_sharding=('model', None),
+                 e_sharding=('model', ))
+             if self.sparse_matmul:
+                 # TODO: organize the SparseMoE and DenseMoE better given they share most interfaces
+                 custom_module = SparseMoE(
+                     dtype=dtype,
+                     num_local_experts=num_local_experts,
+                     apply_expert_weight_before_computation=False,
+                     hidden_size=hidden_size,
+                     intermediate_size_moe=moe_intermediate_size,
+                     num_experts_per_tok=num_experts_per_token,
+                     mesh=self.mesh,
+                     hidden_act=hidden_act,
+                     rngs=self.rng,
+                     random_init=self.random_init,
+                     activation_ffw_td=('data', None),
+                     activation_ffw_ted=('data', None, None),
+                     edf_sharding=('model', None, None),
+                     efd_sharding=('model', None, None),
+                     quantized_dtype=self.weight_loader.quant_dtype
+                     if self.weight_loader.is_model_quantized else None,
+                     router=router) if is_moe_layer else DenseFFW(
+                         dtype=dtype,
+                         hidden_act=hidden_act,
+                         hidden_size=hidden_size,
+                         intermediate_size=ffw_intermediate_size,
+                         rngs=self.rng,
+                         random_init=self.random_init,
+                         df_sharding=(None, ('model', 'expert')),
+                         fd_sharding=(('model', 'expert'), None))
+             else:
+                 custom_module = MoE(
+                     dtype=dtype,
+                     num_local_experts=num_local_experts,
+                     apply_expert_weight_before_computation=False,
+                     hidden_size=hidden_size,
+                     intermediate_size_moe=moe_intermediate_size,
+                     hidden_act=hidden_act,
+                     rngs=self.rng,
+                     random_init=self.random_init,
+                     activation_ffw_td=('data', None),
+                     activation_ffw_ted=('data', None, None),
+                     edf_sharding=('model', None, None),
+                     efd_sharding=('model', None, None),
+                     router=router) if is_moe_layer else DenseFFW(
+                         dtype=dtype,
+                         hidden_act=hidden_act,
+                         hidden_size=hidden_size,
+                         intermediate_size=ffw_intermediate_size,
+                         rngs=self.rng,
+                         random_init=self.random_init,
+                         df_sharding=(None, ('model', 'expert')),
+                         fd_sharding=(('model', 'expert'), None))
+
+             shared_experts = DenseFFW(dtype=dtype,
+                                       hidden_act=hidden_act,
+                                       hidden_size=hidden_size,
+                                       intermediate_size=num_shared_experts *
+                                       moe_intermediate_size,
+                                       rngs=self.rng,
+                                       random_init=self.random_init,
+                                       df_sharding=(None, ('model', 'expert')),
+                                       fd_sharding=(('model', 'expert'), None))
+
+             pre_attention_norm = RMSNorm(
+                 dims=hidden_size,
+                 rngs=self.rng,
+                 random_init=self.random_init,
+                 epsilon=rms_norm_eps,
+                 with_scale=True,
+                 dtype=dtype,
+             )
+
+             pre_mlp_norm = RMSNorm(
+                 dims=hidden_size,
+                 rngs=self.rng,
+                 random_init=self.random_init,
+                 epsilon=rms_norm_eps,
+                 with_scale=True,
+                 dtype=dtype,
+             )
+
+             block = SharedExpertsTransformerBlock(
+                 custom_module=custom_module,
+                 attn=_create_mla(),
+                 pre_attention_norm=pre_attention_norm,
+                 pre_mlp_norm=pre_mlp_norm,
+                 shared_experts=shared_experts)
+             self.layers.append(block)
+
+         self.final_norm = RMSNorm(
+             dims=hidden_size,
+             rngs=self.rng,
+             random_init=self.random_init,
+             epsilon=rms_norm_eps,
+             with_scale=True,
+             dtype=dtype,
+         )
+
+         self.lm_head = LMhead(vocab_size=vocab_size,
+                               hidden_size=hidden_size,
+                               dtype=dtype,
+                               rngs=self.rng,
+                               vd_sharding=(('data', 'expert', 'model'), None),
+                               dv_sharding=(None, ('data', 'expert', 'model')),
+                               random_init=self.random_init)
+
+     # For compatibility with flax.
+     def apply(self, variables, *args, **kwargs):
+         return self.__call__(*args, **kwargs)
+
+     def load_weights(self, rng: PRNGKey, cache_dir: Optional[str] = None):
+         # NOTE: Since we are using nnx.eval_shape to init the model,
+         # we have to pass dynamic arrays here for __call__'s usage.
+         self.rng = nnx.Rngs(rng)
+         self.weight_loader.load_weights(self)
+         self.initialize_cache()
+
+     def initialize_cache(self):
+         # Initialize RoPE caches after weights are loaded and before JIT compilation.
+         for layer in self.layers:
+             if hasattr(layer, 'attn') and hasattr(layer.attn, 'rope'):
+                 if hasattr(layer.attn.rope, 'initialize_cache'):
+                     layer.attn.rope.initialize_cache()
+
+     def __call__(
+         self,
+         kv_caches: List[jax.Array],
+         input_ids: jax.Array,
+         attention_metadata: AttentionMetadata,
+         *args,
+     ) -> Tuple[List[KVCacheType], jax.Array, List[jax.Array]]:
+         is_prefill = False
+         x = self.embedder.encode(input_ids)
+         for (i, block) in enumerate(self.layers):
+             kv_cache = kv_caches[i]
+             new_kv_cache, x = block(x, is_prefill, kv_cache,
+                                     attention_metadata)
+             kv_caches[i] = new_kv_cache
+
+         final_activation = self.final_norm(x)
+
+         return kv_caches, final_activation, []
+
+     def compute_logits(self, hidden_states: jax.Array) -> jax.Array:
+         return self.lm_head.decode(hidden_states)
+
+
+ @dataclass
+ class DeepSeekV3WeightLoader:
+
+     def __init__(self, vllm_config: VllmConfig, num_layers, hidden_size,
+                  q_lora_rank, kv_lora_rank, attn_heads, qk_nope_head_dim,
+                  qk_rope_head_dim, v_head_dim, num_local_experts, model_dtype):
+
+         self.num_layers = num_layers
+         self.names_and_weights_generator = model_weights_generator(
+             model_name_or_path=vllm_config.model_config.model,
+             framework="pt",
+             download_dir=vllm_config.load_config.download_dir)
+         self.is_verbose = vllm_config.additional_config.get(
+             "is_verbose", None) is not None
+         self.num_routed_experts = num_local_experts
+         self.model_dtype = model_dtype
+
+         self._transpose_map = {
+             # dense mlp
+             r"mlp\.down_proj": (1, 0),
+             r"mlp\.gate_proj": (1, 0),
+             r"mlp\.up_proj": (1, 0),
+             # mla
+             r"q_a_proj": (1, 0),
+             r"q_b_proj": (2, 0, 1),
+             r"kv_a_proj_with_mqa": (1, 0),
+             r"kv_b_proj": (2, 0, 1),
+             r"o_proj": (1, 2, 0),
+             # moe
+             r"mlp\.gate\.weight": (1, 0),
+             r"mlp\.experts\.\d+\.gate_proj": (0, 2, 1),
+             r"mlp\.experts\.\d+\.down_proj": (0, 2, 1),
+             r"mlp\.experts\.\d+\.up_proj": (0, 2, 1),
+             r"mlp\.shared_experts\.down_proj": (1, 0),
+             r"mlp\.shared_experts\.gate_proj": (1, 0),
+             r"mlp\.shared_experts\.up_proj": (1, 0),
+             # lm_head
+             r"lm_head\.weight": (1, 0)
+         }
+         self._weight_shape_map = {
+             "q_b_proj":
+             (attn_heads, qk_nope_head_dim + qk_rope_head_dim, q_lora_rank),
+             "kv_b_proj":
+             (attn_heads, qk_nope_head_dim + v_head_dim, kv_lora_rank),
+             "o_proj": (hidden_size, attn_heads, v_head_dim)
+         }
+
+         # Set the mappings from loaded parameter keys to standardized names.
+         self._loaded_to_standardized_keys = {
+             # encode & decode
+             "model.embed_tokens.weight":
+             "embedder.input_embedding_table_VD",
+             "lm_head.weight":
+             "lm_head.input_embedding_table_DV",
+             # final norm
+             "model.norm.weight":
+             "final_norm.scale",
+             # norm in transformer blocks
+             "model.layers.*.input_layernorm.weight":
+             "layers.*.pre_attention_norm.scale",
+             "model.layers.*.post_attention_layernorm.weight":
+             "layers.*.pre_mlp_norm.scale",
+             # attention (MLA)
+             "model.layers.*.self_attn.q_a_layernorm.weight":
+             "layers.*.attn.q_rms_norm.scale",
+             "model.layers.*.self_attn.kv_a_layernorm.weight":
+             "layers.*.attn.kv_rms_norm.scale",
+             "model.layers.*.self_attn.q_a_proj.weight":
+             "layers.*.attn.kernel_q_down_proj_DA",
+             "model.layers.*.self_attn.q_b_proj.weight":
+             "layers.*.attn.kernel_q_up_proj_ANH",
+             "model.layers.*.self_attn.kv_a_proj_with_mqa.weight":
+             "layers.*.attn.kernel_kv_down_proj_DA",
+             "model.layers.*.self_attn.kv_b_proj.weight":
+             "layers.*.attn.kernel_kv_up_proj_ANH",
+             "model.layers.*.self_attn.o_proj.weight":
+             "layers.*.attn.kernel_o_proj_NHD",
+             # Dense ffw
+             "model.layers.*.mlp.gate_proj.weight":
+             "layers.*.custom_module.kernel_gating_DF",
+             "model.layers.*.mlp.up_proj.weight":
+             "layers.*.custom_module.kernel_up_proj_DF",
+             "model.layers.*.mlp.down_proj.weight":
+             "layers.*.custom_module.kernel_down_proj_FD",
+             # MOE(routed experts)
+             "model.layers.*.mlp.gate.weight":
+             "layers.*.custom_module.router.kernel_DE",
+             "model.layers.*.mlp.gate.e_score_correction_bias":
+             "layers.*.custom_module.router.bias_E",
+             "model.layers.*.mlp.experts.*.gate_proj.weight":
+             "layers.*.custom_module.kernel_gating_EDF",
+             "model.layers.*.mlp.experts.*.down_proj.weight":
+             "layers.*.custom_module.kernel_down_proj_EFD",
+             "model.layers.*.mlp.experts.*.up_proj.weight":
+             "layers.*.custom_module.kernel_up_proj_EDF",
+             # MOE(shared experts)
+             "model.layers.*.mlp.shared_experts.down_proj.weight":
+             "layers.*.shared_experts.kernel_down_proj_FD",
+             "model.layers.*.mlp.shared_experts.gate_proj.weight":
+             "layers.*.shared_experts.kernel_gating_DF",
+             "model.layers.*.mlp.shared_experts.up_proj.weight":
+             "layers.*.shared_experts.kernel_up_proj_DF",
+         }
+
+         # TODO (jacobplatin): we shouldn't hard-code this, but the logic to obtain the true quantized dtype
+         # is non-trivial and the default checkpoints all use this dtype
+         self.quant_dtype = jnp.float8_e4m3fn
+
+         self.is_model_quantized = not vllm_config.additional_config.get(
+             "skip_quantization", False)
+         if self.is_model_quantized:
+             # TODO (jacobplatin): expand support eventually
+             quantization_type = vllm_config.model_config.hf_config.quantization_config[
+                 "quant_method"]
+             assert quantization_type == "fp8", "DeepSeek only supports the fp8 quantization method for now"
+             self.scale_dtype, self.quant_dtype = get_quant_dtype_from_qwix_config(
+                 vllm_config)
+
+             logger.info(
+                 f"Quantizing DeepSeek with quantization dtype: {self.quant_dtype} and scale dtype: {self.scale_dtype}"
+             )
+
+             quantization_block_sizes = vllm_config.model_config.hf_config.quantization_config[
+                 "weight_block_size"]
+             assert len(
+                 quantization_block_sizes
+             ) == 2, f"Expected only 2 quantization block sizes but got {quantization_block_sizes}"
+             self.quantization_block_size_n = quantization_block_sizes[0]
+             self.quantization_block_size_k = quantization_block_sizes[1]
+             # TODO (jacobplatin): remove this check in the future
+             assert self.quantization_block_size_n == self.quantization_block_size_k, "Quantization block size n and k must be the same!"
+             # NOTE: this is only needed for pre-quantized models
+             self._scale_shape_map = {
+                 "q_b_proj": (1, qk_nope_head_dim + qk_rope_head_dim,
+                              q_lora_rank // self.quantization_block_size_n),
+                 "kv_b_proj": (attn_heads, (qk_nope_head_dim + v_head_dim) //
+                               self.quantization_block_size_n,
+                               kv_lora_rank // self.quantization_block_size_n),
+                 "o_proj":
+                 (hidden_size // self.quantization_block_size_n, attn_heads,
+                  v_head_dim // self.quantization_block_size_n),
+             }
+             # NOTE: this is only needed for pre-quantized models when doing random weight loading
+             # TODO (jacobplatin): remove or clean this up
+             self.scale_shap_map_for_random_weight_loading = {
+                 "kernel_kv_down_proj_DA": (56, 576),
+                 "kernel_kv_up_proj_ANH": (4, 128, 2),
+                 "kernel_q_up_proj_ANH": (12, 1, 192),
+                 "kernel_o_proj_NHD": (128, 1, 56),
+                 "kernel_down_proj_EFD": (256, 16, 56),
+                 "kernel_up_proj_EDF": (256, 56, 16),
+                 "kernel_gating_EDF": (256, 56, 16),
+             }
+
+     def map_loaded_to_standardized_name(self, loaded_key: str) -> str:
+         # Find the corresponding model key using the HF key
+         if "layer" in loaded_key:
+             # extract layer number and replace it with *
+             layer_num = re.search(r"layers\.(\d+)", loaded_key).group(1)
+             layer_key = re.sub(r"layers\.\d+", "layers.*", loaded_key)
+             # extract expert number if exists and replace it with *
+             if "experts" in loaded_key and "shared_experts" not in loaded_key:
+                 layer_key = re.sub(r"experts\.\d+", "experts.*", layer_key)
+             # get standardized key and replace * with layer number.
+             mapped_key = self._loaded_to_standardized_keys.get(
+                 layer_key, loaded_key)
+             mapped_key = re.sub(r"layers\.\*", f"layers.{layer_num}",
+                                 mapped_key)
+         else:
+             mapped_key = self._loaded_to_standardized_keys.get(
+                 loaded_key, loaded_key)
+         return mapped_key
+
+     def _transpose_params(self, param_key: str, param_tensor: jax.Array):
+         for key, value in self._transpose_map.items():
+             if re.search(key, param_key):
+                 return jnp.transpose(param_tensor, value)
+         return param_tensor  # Base case / no-op
+
+     def _process_moe_weights(self, loaded_name, loaded_weight, weights_dict):
+         layer_num = re.search(r"layers\.(\d+)", loaded_name).group(1)
+         expert_num_str = re.search(r"experts\.(\d+)", loaded_name).group(1)
+         expert_idx = int(expert_num_str)
+
+         if layer_num not in weights_dict:
+             weights_dict[layer_num] = ([None] * self.num_routed_experts, 0)
+
+         expert_list, count = weights_dict[layer_num]
+
+         expert_list[expert_idx] = loaded_weight
+         count += 1
+         weights_dict[layer_num] = (expert_list, count)
+
+         if count == self.num_routed_experts:
+             stacked_weights = torch.stack(expert_list, axis=0)
+             del weights_dict[layer_num]
+             return stacked_weights
+         return None
+
+     def _load_individual_weight(self,
+                                 name,
+                                 weight,
+                                 model_params,
+                                 model_mesh,
+                                 scale=None) -> Tuple[int, int]:
+         """
+         Loads a single weight into the model.
+
+         NOTE: if using the base quantized model, it is assumed that the Qwix abstract
+         pass has been run and that the model weights are thus QArrays, which we
+         will then load the weights/scales into.
+
+         Args:
+             name: The name of the weight.
+             weight: The weight to load.
+             model_params: The model parameters.
+             model_mesh: The model mesh.
+             scale: The scale of the weight (if using the pre-quantized model).
+
+         Returns:
+             Tuple[int, int]: The size (in GB) for the given layer overall and per shard.
+             NOTE: if using the pre-quantized model (with Qwix), we'll include the scale size as well.
+         """
+         mapped_name = self.map_loaded_to_standardized_name(name)
+         base_model_weight = get_param(model_params, mapped_name)
+         model_weight = base_model_weight.array.qvalue if hasattr(
+             base_model_weight, "array") else base_model_weight
+         sharding = base_model_weight.array.qvalue.sharding if hasattr(
+             base_model_weight, "array") else base_model_weight.sharding
+
+         # Convert weights from torch into numpy
+         cast_type = model_weight.value.dtype
+
+         torch_view_type = DTYPE_VIEW_MAP.get(jnp.dtype(cast_type))
+
+         if torch_view_type:
+             # Avoid unnecessary upcasting and mem copy by viewing the tensor's
+             # raw data as integers before converting to a JAX array.
+             weight_np = jnp.array(
+                 weight.view(torch_view_type).numpy()).view(cast_type)
+         else:
+             raise ValueError(
+                 f"Unsupported dtype for tensor conversion: {cast_type}")
+
+         if scale is not None:
+             scale = scale.to(torch.float32).numpy().astype(self.scale_dtype)
+
+         # Reshape and transpose weights if necessary.
+         weight_np = reshape_params(name, weight_np, self._weight_shape_map)
+         if scale is not None:
+             scale = reshape_params(name, scale, self._scale_shape_map)
+         weight_np = self._transpose_params(name, weight_np)
+         if scale is not None:
+             scale = self._transpose_params(name, scale)
+             weight_shape = weight_np.shape
+             scale_shape = scale.shape
+             assert len(weight_shape) == len(scale_shape)
+             for idx, (weight_dim,
+                       scale_dim) in enumerate(zip(weight_shape, scale_shape)):
+                 if weight_dim // self.quantization_block_size_n != scale_dim and weight_dim // scale_dim != 1:
+                     old_scale_shape = scale.shape
+                     scale = scale.repeat(self.quantization_block_size_n,
+                                          axis=idx)[:, :weight_dim]
+                     logger.warning(
+                         f"Got a weight with shape {weight_shape} and scale with shape {old_scale_shape} "
+                         f"where the scale_dim {scale_dim} does not match the weight_dim {weight_dim} "
+                         f"multiplied by the quantization block size {self.quantization_block_size_n}. "
+                         f"Repeating the scale to new shape {scale.shape} along axis {idx} with repeat size {self.quantization_block_size_n}."
+                     )
+                     break
+
+         if model_weight.value.shape != weight_np.shape:
+             raise ValueError(
+                 f"Loaded shape for {name}: {weight_np.shape} "
+                 f"does not match model shape for {mapped_name}: {model_weight.value.shape}!"
+             )
+
+         def get_slice(index):
+             return weight_np[index]
+
+         def get_slice_scale(index):
+             # ruff: noqa: F821
+             return scale[index]
+
+         sharded_array = jax.make_array_from_callback(
+             weight_np.shape, NamedSharding(model_mesh, P(*sharding)),
+             get_slice)
+
+         if scale is not None:
+             maybe_sharded_scale = scale
+             # Since, by default, we'll use the same sharding as the weights, we might
+             # encounter an error where the smaller/different sharding dim isn't divisible
+             # by the parallel size
+             # NOTE: Qwix expert dangyi@ mentioned that we don't need to worry about accuracy
+             # impacts when sharing scales
+             try:
+                 maybe_sharded_scale = jax.make_array_from_callback(
+                     scale.shape, NamedSharding(model_mesh, P(*sharding)),
+                     get_slice_scale)
+             except ValueError:
+                 logger.warning(
+                     f"Could not create sharded scale for {name} with shape {scale.shape} and sharding {sharding}, skipping sharding..."
+                 )
+             # NOTE: Despite the fact that scale has the name `scale_inv` in it, we don't need to
+             # inverse it
+             assert base_model_weight.array.scale.value.dtype == maybe_sharded_scale.dtype, f"Expected dtype for model weight scale with name {mapped_name} and dtype ({base_model_weight.array.scale.value.dtype}) to match that of the incoming weight scale ({maybe_sharded_scale.dtype})"
+             assert base_model_weight.array.qvalue.value.dtype == sharded_array.dtype, f"Expected dtype for model weight with name {mapped_name} and dtype ({base_model_weight.array.qvalue.value.dtype}) to match that of the incoming weight ({sharded_array.dtype})"
+             base_model_weight.array.scale.value = maybe_sharded_scale
+             base_model_weight.array.qvalue.value = sharded_array
+         else:
+             assert model_weight.value.dtype == sharded_array.dtype, f"Expected dtype for model weight with name {mapped_name} and dtype ({model_weight.value.dtype}) to match that of the incoming weight ({sharded_array.dtype})"
+             model_weight.value = sharded_array
+
+         model_weight_size_bytes = model_weight.nbytes / 1e9
+         model_weight_local_size_bytes = model_weight.addressable_shards[
+             0].data.nbytes / 1e9
+
+         if scale is not None:
+             model_weight_size_bytes += base_model_weight.array.scale.nbytes / 1e9
+             model_weight_local_size_bytes += base_model_weight.array.scale.addressable_shards[
+                 0].data.nbytes / 1e9
+
+         if self.is_verbose:
+             logger.info(f"Memory usage after loading in {name}: "
+                         f"hbm={utils.hbm_usage_gb(jax.local_devices())}Gb")
+             print_param_info(model_weight, name)
+             if scale is not None:
+                 print_param_info(base_model_weight.array.scale,
+                                  "scale for " + name)
+
+         del weight, scale
+         return model_weight_size_bytes, model_weight_local_size_bytes
+
+     def load_weights(self, model_for_loading: nnx.Module):
+         model_params = nnx.state(model_for_loading)
+         logger.warning(
+             f"loaded_to_standardized_keys: {self._loaded_to_standardized_keys}"
+         )
+         cumulative_global_memory = 0
+         cumulative_local_memory = 0
+         mlp_experts_gate_proj_weights = {}
+         mlp_experts_gate_proj_scales = {}
+         mlp_experts_up_proj_weights = {}
+         mlp_experts_up_proj_scales = {}
+         mlp_experts_down_proj_weights = {}
+         mlp_experts_down_proj_scales = {}
+         quantized_weights = {}
+         quantized_scales = {}
+         with jax.default_device(jax.devices("cpu")[0]):
+             for loaded_name, loaded_weight in self.names_and_weights_generator:
+                 # Skip if the model has fewer layers than original.
+                 if re.search(r"layers\.(\d+)", loaded_name):
+                     layer_num = re.search(r"layers\.(\d+)",
+                                           loaded_name).group(1)
+                     if int(layer_num) >= self.num_layers:
+                         del loaded_weight
+                         continue
+                 if 'layers.61' in loaded_name:
+                     # skip loading MTP module.
+                     del loaded_weight
+                     continue
+                 if re.search(r"experts\.(\d+)", loaded_name):
+                     expert_num = re.search(r"experts\.(\d+)",
+                                            loaded_name).group(1)
+                     if int(expert_num) >= self.num_routed_experts:
+                         del loaded_weight
+                         continue
+                 # NOTE: loaded_weight will be a Torch tensor, so we need to convert it to the
+                 # equivalent jnp dtype
+                 # TODO (jacobplatin): refactor this so that we instead change / update `model_weights_generator`
+                 # instead of checking "weight_scale_inv" and assuming quantization method is fp8
+                 scale = None
+                 if loaded_weight.dtype == j2t_dtype(self.quant_dtype.dtype):
+                     if self.is_model_quantized:
+                         scale_name = loaded_name.replace(
+                             ".weight", ".weight_scale_inv")
+                         if scale_name in quantized_scales:
+                             scale = quantized_scales[scale_name]
+                             del quantized_scales[scale_name]
+                         else:
+                             quantized_weights[loaded_name] = loaded_weight
+                             continue
+                     else:
+                         quantized_weights[loaded_name] = loaded_weight
+                         continue
+
+                 if loaded_name.endswith(".weight_scale_inv"):
+                     if self.is_model_quantized:
+                         weight_name = loaded_name.replace(
+                             ".weight_scale_inv", ".weight")
+                         if weight_name in quantized_weights:
+                             scale = loaded_weight
+                             loaded_weight = quantized_weights[weight_name]
+                             loaded_name = weight_name
+                             del quantized_weights[weight_name]
+                         else:
+                             quantized_scales[loaded_name] = loaded_weight
+                             continue
+                     # In the case that we don't want to use the quantized weights,
+                     # we'll dequantize the weights using the loaded scale on-the-fly
+                     else:
+                         # assuming weights are loaded before scales.
+                         weight_name = loaded_name.replace(
+                             ".weight_scale_inv", ".weight")
+                         loaded_weight = weights_dequant_cpu(
+                             quantized_weights[weight_name], loaded_weight,
+                             self.model_dtype)
+                         loaded_name = weight_name
+                         del quantized_weights[weight_name]
+                 # concat mlp.experts weights
+                 stacked_scales = None
+                 stacked_weights = None
+                 if "mlp.experts" in loaded_name:
+                     if "down_proj" in loaded_name:
+                         stacked_weights = self._process_moe_weights(
+                             loaded_name, loaded_weight,
+                             mlp_experts_down_proj_weights)
+                         if scale is not None:
+                             stacked_scales = self._process_moe_weights(
+                                 loaded_name, scale,
+                                 mlp_experts_down_proj_scales)
+                     if "gate_proj" in loaded_name:
+                         stacked_weights = self._process_moe_weights(
+                             loaded_name, loaded_weight,
+                             mlp_experts_gate_proj_weights)
+                         if scale is not None:
+                             stacked_scales = self._process_moe_weights(
+                                 loaded_name, scale,
+                                 mlp_experts_gate_proj_scales)
+                     if "up_proj" in loaded_name:
+                         stacked_weights = self._process_moe_weights(
+                             loaded_name, loaded_weight,
+                             mlp_experts_up_proj_weights)
+                         if scale is not None:
+                             stacked_scales = self._process_moe_weights(
+                                 loaded_name, scale, mlp_experts_up_proj_scales)
+                     if stacked_weights is not None:
+                         weight_bytes, weight_shards = self._load_individual_weight(
+                             loaded_name,
+                             stacked_weights,
+                             model_params,
+                             model_for_loading.mesh,
+                             scale=stacked_scales)
+                         if self.is_verbose:
+                             cumulative_global_memory += weight_bytes
+                             cumulative_local_memory += weight_shards
+                             logger.info(
+                                 f"Cumulative global memory: {cumulative_global_memory} GB"
+                             )
+                             logger.info(
+                                 f"Cumulative local memory: {cumulative_local_memory} GB"
+                             )
+                 else:
+                     weight_bytes, weight_shards = self._load_individual_weight(
+                         loaded_name,
+                         loaded_weight,
+                         model_params,
+                         model_for_loading.mesh,
+                         scale=scale)
+                     if self.is_verbose:
+                         cumulative_global_memory += weight_bytes
+                         cumulative_local_memory += weight_shards
+                         logger.info(
+                             f"Cumulative global memory: {cumulative_global_memory} GB"
+                         )
+                         logger.info(
+                             f"Cumulative local memory: {cumulative_local_memory} GB"
+                         )
+
+         del mlp_experts_gate_proj_weights
+         del mlp_experts_up_proj_weights
+         del mlp_experts_down_proj_weights
+         del quantized_weights
+         del quantized_scales
+         # TODO: validate that all of the model_params were accounted for as well.
+         nnx.update(model_for_loading, model_params)
+
+
+ def weights_dequant_cpu(x: torch.Tensor,
+                         s: torch.Tensor,
+                         output_dtype: jnp.dtype,
+                         block_size: int = 128) -> torch.Tensor:
+     assert x.dim() == 2 and s.dim() == 2, "Both x and s must be 2D tensors"
+     M, N = x.shape
+
+     x = x.to(torch.float32)
+     s = s.to(torch.float32)
+     y = torch.empty_like(x)
+
+     M_main = (M // block_size) * block_size
+     N_main = (N // block_size) * block_size
+
+     if M_main > 0 and N_main > 0:
+         x_main = x[:M_main, :N_main]
+         s_main = s[:(M // block_size), :(N // block_size)]
+
+         x_reshaped = x_main.view(M // block_size, block_size, N // block_size,
+                                  block_size).permute(0, 2, 1, 3)
+         s_reshaped = s_main.view(M // block_size, N // block_size, 1, 1)
+         y_main = (x_reshaped * s_reshaped).permute(0, 2, 1,
+                                                    3).reshape(M_main, N_main)
+
+         y[:M_main, :N_main] = y_main
+
+     if N_main < N:
+         for i in range(0, M_main, block_size):
+             block = x[i:i + block_size, N_main:N]
+             scale = s[i // block_size, N // block_size]
+             y[i:i + block_size, N_main:N] = block * scale
+
+     if M_main < M:
+         for j in range(0, N, block_size):
+             block = x[M_main:M, j:j + block_size]
+             scale = s[M // block_size, j // block_size]
+             y[M_main:M, j:j + block_size] = block * scale
+
+     return y.to(j2t_dtype(jnp.dtype(output_dtype)))
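
Note on the dtype-view conversion used above: _load_individual_weight moves each PyTorch tensor into JAX without an intermediate upcast by viewing its raw bits as the integer type listed in DTYPE_VIEW_MAP and reinterpreting them on the JAX side. A minimal standalone sketch of the same idea (not part of the packaged file; it assumes a PyTorch build that exposes torch.uint16, which the loader itself relies on):

    import jax.numpy as jnp
    import torch

    # Reinterpret the tensor's raw bfloat16 bits as uint16, hand those bits to
    # JAX, then view them back as bfloat16 -- no float32 upcast of the values.
    t = torch.randn(4, 8, dtype=torch.bfloat16)
    bits = t.view(torch.uint16).numpy()
    j = jnp.asarray(bits).view(jnp.bfloat16)
    assert j.dtype == jnp.bfloat16 and j.shape == (4, 8)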
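
Note on weights_dequant_cpu: it performs block-wise dequantization, multiplying every element by the single scale of the 128x128 tile it falls in, i.e. y[i, j] = x[i, j] * s[i // block_size, j // block_size], with ragged edge blocks handled by the two trailing loops. The following is an illustrative reference of the same arithmetic (a sketch for comparison only, not part of the wheel; it returns float32 rather than applying the final dtype cast):

    import torch

    def blockwise_dequant_reference(x: torch.Tensor, s: torch.Tensor,
                                    block_size: int = 128) -> torch.Tensor:
        # Broadcast each scale over its block_size x block_size tile, then trim
        # the expanded scale grid to x's (possibly ragged) shape.
        m, n = x.shape
        s_full = s.repeat_interleave(block_size, dim=0)[:m]
        s_full = s_full.repeat_interleave(block_size, dim=1)[:, :n]
        return x.to(torch.float32) * s_full.to(torch.float32)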