sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_one_batch.py +2 -0
- sglang/check_env.py +3 -3
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +15 -0
- sglang/srt/conversation.py +122 -1
- sglang/srt/entrypoints/engine.py +44 -22
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +107 -82
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +1 -1
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +84 -35
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +25 -15
- sglang/srt/managers/scheduler.py +263 -59
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tp_worker.py +51 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +115 -57
- sglang/srt/models/deepseek_nextn.py +1 -257
- sglang/srt/models/deepseek_v2.py +78 -18
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +92 -30
- sglang/srt/models/llama4.py +2 -1
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +0 -12
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/openai_api/adapter.py +34 -22
- sglang/srt/openai_api/protocol.py +11 -1
- sglang/srt/server_args.py +67 -22
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +88 -9
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +29 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -752,7 +752,7 @@ class DeepseekV2AttentionMLA(nn.Module):
         q_nope_out = q_nope_out.transpose(0, 1)
 
         k_nope = latent_cache[..., : self.kv_lora_rank]
-        k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+        k_nope = self.kv_a_layernorm(k_nope.contiguous()).unsqueeze(1)
         k_pe = latent_cache[..., self.kv_lora_rank :].unsqueeze(1)
 
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
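The only functional change in this hunk is the added `.contiguous()` call. Slicing the last dimension of `latent_cache` produces a strided view, and fused normalization kernels generally expect densely packed input. A minimal standalone sketch of the PyTorch behavior involved (illustration only, not sglang code; the tensor shapes are made up):

```python
import torch

# Hypothetical shape: [num_tokens, kv_lora_rank + qk_rope_head_dim]
latent_cache = torch.randn(4, 576)
kv_lora_rank = 512

k_nope = latent_cache[..., :kv_lora_rank]   # strided view, stride (576, 1)
print(k_nope.is_contiguous())               # False
print(k_nope.contiguous().is_contiguous())  # True: materialized as a dense copy
```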
@@ -1391,6 +1391,9 @@ class DeepseekV2Model(nn.Module):
 
         self.dp_size = get_attention_dp_size()
 
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
     def forward(
         self,
         input_ids: torch.Tensor,
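A likely motivation for the new accessor: multimodal wrappers (such as the Kimi-VL model added in this release) embed text tokens themselves and splice in projected image features, so they need the bare embedding module rather than a full forward pass. A toy illustration of the pattern (not sglang code):

```python
import torch
from torch import nn

class TinyTextModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(100, 16)

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embed_tokens

model = TinyTextModel()
input_ids = torch.tensor([1, 5, 7])
text_embeds = model.get_input_embeddings()(input_ids)  # shape [3, 16]
# A multimodal routine can now overwrite rows of text_embeds at image
# placeholder positions with projected vision features.
print(text_embeds.shape)
```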
@@ -1502,11 +1505,20 @@ class DeepseekV2ForCausalLM(nn.Module):
             input_ids, hidden_states, self.lm_head, forward_batch
         )
 
-    def post_load_weights(self):
+    def post_load_weights(self, is_nextn=False):
 
         # Perform post-processing after loading weights
-        for layer_id in range(self.config.num_hidden_layers):
-            self_attn = self.model.layers[layer_id].self_attn
+        layer_ids = (
+            range(self.config.num_hidden_layers)
+            if not is_nextn
+            else [self.config.num_hidden_layers]
+        )
+        for layer_id in layer_ids:
+            self_attn = (
+                self.model.layers[layer_id].self_attn
+                if not is_nextn
+                else self.model.decoder.self_attn
+            )
             if hasattr(self_attn.kv_b_proj, "qweight"):
                 # AWQ compatible
                 if _is_cuda:
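The NextN (multi-token prediction) draft layer is stored in DeepSeek checkpoints at the index just past the last decoder layer, which is why the `is_nextn` branch selects `[self.config.num_hidden_layers]` and reads `self.model.decoder`. A small sketch of the selection logic, with an assumed layer count:

```python
from types import SimpleNamespace

config = SimpleNamespace(num_hidden_layers=61)  # assumed depth, for illustration

def target_layer_ids(config, is_nextn: bool):
    # Mirrors the hunk above: the main model post-processes every decoder
    # layer; the NextN draft model has exactly one layer, stored at index
    # num_hidden_layers in the checkpoint.
    if not is_nextn:
        return list(range(config.num_hidden_layers))
    return [config.num_hidden_layers]

print(target_layer_ids(config, is_nextn=False)[:3])  # [0, 1, 2]
print(target_layer_ids(config, is_nextn=True))       # [61]
```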
@@ -1612,7 +1624,20 @@ class DeepseekV2ForCausalLM(nn.Module):
                 self_attn.w_vc = w_vc.contiguous()
                 self_attn.use_deep_gemm_bmm = True
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("gate_up_proj", "gate_proj", 0),
@@ -1640,12 +1665,19 @@ class DeepseekV2ForCausalLM(nn.Module):
                 "up_proj.weight_scale_inv",
             ]
             names_to_remove = []
-            for moe_layer in tqdm(
+
+            moe_layers = (
                 range(
                     self.config.first_k_dense_replace,
                     self.config.num_hidden_layers,
                     self.config.moe_layer_freq,
-                ),
+                )
+                if not is_nextn
+                else [nextn_layer_id]
+            )
+
+            for moe_layer in tqdm(
+                moe_layers,
                 desc=f"Cloning {self.n_share_experts_fusion} "
                 "replicas of the shared expert into MoE",
             ):
@@ -1686,18 +1718,46 @@ class DeepseekV2ForCausalLM(nn.Module):
         )
         cached_a_proj = {} if fuse_qkv_a_proj else None
 
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
-            # TODO(HandH1998): Modify it when nextn is supported.
-            if hasattr(self.config, "num_nextn_predict_layers"):
-                num_nextn_layers = self.config.num_nextn_predict_layers
-                if num_nextn_layers > 0 and name.startswith("model.layers"):
-                    name_list = name.split(".")
-                    if (
-                        len(name_list) >= 3
-                        and int(name_list[2]) >= self.config.num_hidden_layers
-                    ):
-                        continue
+            if not is_nextn:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
             if "rotary_emb.inv_freq" in name:
                 continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
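The effect of the `is_nextn` branch is a prefix rewrite: checkpoint keys under `model.layers.{nextn_layer_id}` are mapped either to `model` (for the MTP-specific modules) or to `model.decoder` (for the single decoder layer). A standalone sketch of that rewrite, using a hypothetical layer id and weight names:

```python
nextn_layer_id = 61  # hypothetical: num_hidden_layers of the target model
nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
nextn_spec_weight_names = ["shared_head.norm", "eh_proj", "enorm", "hnorm"]

def remap(name: str) -> str:
    # NextN-specific modules hang directly off `model`; everything else
    # belongs to the draft model's lone decoder layer.
    if any(w in name for w in nextn_spec_weight_names):
        return name.replace(nextn_layer_prefix, "model")
    return name.replace(nextn_layer_prefix, "model.decoder")

print(remap("model.layers.61.eh_proj.weight"))
# -> model.eh_proj.weight
print(remap("model.layers.61.self_attn.q_a_proj.weight"))
# -> model.decoder.self_attn.q_a_proj.weight
```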
@@ -1786,7 +1846,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                     )
                     weight_loader(param, loaded_weight)
 
-        self.post_load_weights()
+        self.post_load_weights(is_nextn=is_nextn)
 
     def get_embed_and_head(self):
         return self.model.embed_tokens.weight, self.lm_head.weight
sglang/srt/models/kimi_vl.py
ADDED
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: E501
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
+# Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
+#
+# The code is based on llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py), but modified for KimiVL.
+#
+# Licensing Information:
+# - Code derived from llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py) is licensed under the Apache License, Version 2.0.
+# - Other parts of the code are licensed under the MIT License.
+#
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import copy
+import logging
+import math
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.activations import GELUActivation
+
+from sglang.srt.configs import KimiVLConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+from sglang.srt.models.kimi_vl_moonvit import MoonVitPretrainedModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+# For dummy input only
+@dataclass
+class MaxImageTokenMeta:
+    width: int = 1024
+    height: int = 1024
+
+
+class KimiVLMultiModalProjector(nn.Module):
+
+    def __init__(self, config: KimiVLConfig):
+        super().__init__()
+
+        self.hidden_size = (
+            config.vision_config.hidden_size
+            * config.vision_config.merge_kernel_size[0]
+            * config.vision_config.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = GELUActivation()
+        self.act = QuickGELU()
+        self.linear_2 = nn.Linear(
+            self.hidden_size, config.text_config.hidden_size, bias=True
+        )
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class KimiVLForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: KimiVLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        **kwargs,  # fix init_tts argument error
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert isinstance(config.vision_config, MoonViTConfig)
+
+        self.vision_tower = MoonVitPretrainedModel(config.vision_config)
+
+        self.multi_modal_projector = KimiVLMultiModalProjector(config=config)
+        self.quant_config = quant_config
+        text_config = copy.deepcopy(config.text_config)
+        text_config.architectures = ["DeepseekV2ForCausalLM"]
+        self.language_model = DeepseekV2ForCausalLM(
+            config=text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = (
+            torch.cat([item.pixel_values for item in items], dim=0)
+            .type(self.vision_tower.dtype)
+            .to(self.vision_tower.device)
+        )
+        image_grid_thws = torch.concat(
+            [item.image_grid_thws for item in items], dim=0
+        ).to(self.vision_tower.device)
+        image_features = self.vision_tower(pixel_values, image_grid_thws)
+        assert isinstance(image_features, list)
+        # lengths = [x.shape[0] for x in image_features]
+        res = self.multi_modal_projector(torch.cat(image_features))  # .split(lengths)
+        return res
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        # Get all special token IDs
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens(mm_inputs.im_token_id)
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            image_data_embedding_func=self.get_image_feature,
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        config = self.config.text_config
+        _KEYS_TO_MODIFY_MAPPING = {
+            # "language_model.lm_head": "lm_head",
+            # "language_model.model": "language_model",
+        }
+        # only doing this for language model part for now.
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        if not config.use_mla:
+            stacked_params_mapping += [
+                (".qkv_proj", ".q_proj", "q"),
+                (".qkv_proj", ".k_proj", "k"),
+                (".qkv_proj", ".v_proj", "v"),
+            ]
+        if getattr(config, "n_routed_experts", None):
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=config.n_routed_experts,
+            )
+        else:
+            expert_params_mapping = []
+
+        params_dict = dict(self.named_parameters())
+        for args in weights:
+            name, loaded_weight = args[:2]
+            kwargs = args[2] if len(args) > 2 else {}
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
+                if key_to_modify in name:
+                    name = name.replace(key_to_modify, new_key)
+            use_default_weight_loading = False
+            if "vision" in name:
+                if self.vision_tower is not None:
+                    # We only do sharding for language model and
+                    # not vision model for now.
+                    use_default_weight_loading = True
+            else:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id, **kwargs)
+                    break
+                else:
+                    for idx, (
+                        param_name,
+                        weight_name,
+                        expert_id,
+                        shard_id,
+                    ) in enumerate(expert_params_mapping):
+                        if weight_name not in name:
+                            continue
+                        name = name.replace(weight_name, param_name)
+
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            expert_id=expert_id,
+                            shard_id=shard_id,
+                            **kwargs,
+                        )
+                        break
+                    else:
+                        use_default_weight_loading = True
+            if use_default_weight_loading:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, **kwargs)
+        self.language_model.post_load_weights()
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: DeepseekV2Config, weight_name: str
+) -> Optional[int]:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"model.layers.{layer_idx+i}."):
+                return layer_idx + i
+    return None
+
+
+EntryClass = [KimiVLForConditionalGeneration]
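One detail worth noting in the new file: `KimiVLMultiModalProjector` packs each `merge_kernel_size[0] x merge_kernel_size[1]` group of ViT patch embeddings into a single projector token via `view(-1, self.hidden_size)` before the two-layer MLP maps it to the text model width. A shape-level sketch with assumed sizes (the real values come from the checkpoint config; GELU stands in for QuickGELU):

```python
import torch
from torch import nn

vit_hidden, merge = 1152, (2, 2)   # assumed MoonViT width and merge kernel
text_hidden = 2048                 # assumed text model width
proj_in = vit_hidden * merge[0] * merge[1]  # 4608, matches self.hidden_size

patches = torch.randn(64, vit_hidden)        # 64 ViT patch embeddings
normed = nn.LayerNorm(vit_hidden)(patches)   # pre_norm acts per patch
packed = normed.view(-1, proj_in)            # [16, 4608]: 4 patches per token
h = nn.Linear(proj_in, proj_in)(packed)      # linear_1
h = torch.nn.functional.gelu(h)              # stand-in for QuickGELU
h = nn.Linear(proj_in, text_hidden)(h)       # linear_2
print(h.shape)                               # torch.Size([16, 2048])
```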