sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/longcat_flash_nextn.py (new file)
@@ -0,0 +1,691 @@
+ # Apache License, Version 2.0:
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # MIT License:
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ import concurrent.futures
+ import logging
+ import os
+ from enum import IntEnum, auto
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from tqdm import tqdm
+
+ from sglang.srt.configs import LongcatFlashConfig
+ from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+ from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+ from sglang.srt.layers.dp_attention import (
+     get_attention_tp_rank,
+     get_attention_tp_size,
+     is_dp_attention_enabled,
+ )
+ from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import ReplicatedLinear
+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization import deep_gemm_wrapper
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+ from sglang.srt.layers.quantization.fp8_utils import (
+     block_quant_dequant,
+     block_quant_to_tensor_quant,
+     channel_quant_to_tensor_quant,
+     normalize_e4m3fn_to_e4m3fnuz,
+     requant_weight_ue8m0_inplace,
+ )
+ from sglang.srt.layers.quantization.int8_utils import (
+     block_dequant as int8_block_dequant,
+ )
+ from sglang.srt.layers.vocab_parallel_embedding import (
+     ParallelLMHead,
+     VocabParallelEmbedding,
+ )
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+ from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
+ from sglang.srt.utils import (
+     BumpAllocator,
+     LazyValue,
+     add_prefix,
+     bind_or_assign,
+     cpu_has_amx_support,
+     get_bool_env_var,
+     get_device_sm,
+     is_cpu,
+     is_cuda,
+     is_hip,
+     is_npu,
+ )
+
+ _is_hip = is_hip()
+ _is_cuda = is_cuda()
+ _is_npu = is_npu()
+ _is_fp8_fnuz = is_fp8_fnuz()
+ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+ _is_cpu_amx_available = cpu_has_amx_support()
+ _is_cpu = is_cpu()
+ _device_sm = get_device_sm()
+
+ if _is_cuda:
+     from sgl_kernel import (
+         awq_dequantize,
+         bmm_fp8,
+         dsv3_fused_a_gemm,
+         dsv3_router_gemm,
+         merge_state_v2,
+     )
+ elif _is_cpu and _is_cpu_amx_available:
+     pass
+ elif _is_hip:
+     from sglang.srt.layers.quantization.awq_triton import (
+         awq_dequantize_triton as awq_dequantize,
+     )
+ else:
+     from vllm._custom_ops import awq_dequantize
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LongcatFlashDenseDecoderLayer(nn.Module):
+
+     def __init__(
+         self,
+         config: LongcatFlashConfig,
+         layer_id: int,
+         quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
+         alt_stream: Optional[torch.cuda.Stream] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.layer_id = layer_id
+         self.alt_stream = alt_stream
+
+         self.self_attn = DeepseekV2AttentionMLA(
+             config=config,
+             hidden_size=config.hidden_size,
+             num_heads=config.num_attention_heads,
+             qk_nope_head_dim=config.qk_nope_head_dim,
+             qk_rope_head_dim=config.qk_rope_head_dim,
+             v_head_dim=config.v_head_dim,
+             q_lora_rank=config.q_lora_rank,
+             kv_lora_rank=config.kv_lora_rank,
+             rope_theta=config.rope_theta,
+             rope_scaling=None,
+             max_position_embeddings=config.max_position_embeddings,
+             quant_config=quant_config,
+             layer_id=layer_id,
+             reduce_results=False,
+             prefix=add_prefix(f"self_attn", prefix),
+             alt_stream=self.alt_stream,
+         )
+
+         self.mlp = LongcatFlashMLP(
+             hidden_size=config.hidden_size,
+             intermediate_size=config.intermediate_size,
+             hidden_act=config.hidden_act,
+             quant_config=quant_config,
+             prefix=add_prefix(f"mlps", prefix),
+         )
+         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+
+         self.attn_tp_size = get_attention_tp_size()
+         self.attn_tp_rank = get_attention_tp_rank()
+         self.layer_scatter_modes = LayerScatterModes.init_new(
+             layer_id=self.layer_id,
+             num_layers=config.num_hidden_layers,
+             is_layer_sparse=False,
+             is_previous_layer_sparse=False,
+         )
+         self.layer_communicator = LayerCommunicator(
+             layer_scatter_modes=self.layer_scatter_modes,
+             input_layernorm=self.input_layernorm,
+             post_attention_layernorm=self.post_attention_layernorm,
+         )
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         forward_batch: ForwardBatch,
+         residual: Optional[torch.Tensor],
+         zero_allocator: BumpAllocator,
+     ) -> torch.Tensor:
+
+         hidden_states, residual = self.layer_communicator.prepare_attn(
+             hidden_states, residual, forward_batch
+         )
+         if hidden_states.shape[0] != 0:
+             hidden_states = self.self_attn(
+                 positions=positions,
+                 hidden_states=hidden_states,
+                 forward_batch=forward_batch,
+                 zero_allocator=zero_allocator,
+             )
+
+         hidden_states, residual = self.layer_communicator.prepare_mlp(
+             hidden_states, residual, forward_batch
+         )
+         hidden_states = self.mlp(hidden_states)
+         hidden_states, residual = self.layer_communicator.postprocess_layer(
+             hidden_states, residual, forward_batch
+         )
+         return hidden_states, residual
+
+
+ class LongcatFlashModelNextN(nn.Module):
+     def __init__(
+         self,
+         config: LongcatFlashConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
+     ) -> None:
+         super().__init__()
+         self.vocab_size = config.vocab_size
+         self.alt_stream = torch.cuda.Stream()
+
+         self.embed_tokens = VocabParallelEmbedding(
+             config.vocab_size,
+             config.hidden_size,
+             enable_tp=not is_dp_attention_enabled(),
+             prefix=add_prefix("embed_tokens", prefix),
+         )
+
+         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+         self.eh_proj = ReplicatedLinear(
+             2 * config.hidden_size,
+             config.hidden_size,
+             bias=False,
+             quant_config=quant_config,
+             prefix=add_prefix("eh_proj", ""),
+         )
+         self.decoder = LongcatFlashDenseDecoderLayer(
+             config, 0, quant_config=quant_config, alt_stream=self.alt_stream
+         )
+
+         self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def get_input_embeddings(self) -> torch.Tensor:
+         return self.embed_tokens
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         forward_batch: ForwardBatch,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         total_num_layers = 1
+         device = input_embeds.device if input_embeds is not None else input_ids.device
+         zero_allocator = BumpAllocator(
+             buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+             dtype=torch.float32,
+             device=device,
+         )
+         if input_embeds is None:
+             hidden_states = self.embed_tokens(input_ids)
+         else:
+             hidden_states = input_embeds
+
+         if hidden_states.shape[0] > 0:
+             hidden_states, _ = self.eh_proj(
+                 torch.cat(
+                     (
+                         self.enorm(hidden_states),
+                         self.hnorm(forward_batch.spec_info.hidden_states),
+                     ),
+                     dim=-1,
+                 )
+             )
+
+         residual = None
+         with get_global_expert_distribution_recorder().disable_this_region():
+             hidden_states, residual = self.decoder(
+                 positions, hidden_states, forward_batch, residual, zero_allocator
+             )
+
+         if not forward_batch.forward_mode.is_idle():
+             if residual is not None:
+                 hidden_states, _ = self.final_layernorm(hidden_states, residual)
+             else:
+                 hidden_states = self.final_layernorm(hidden_states)
+         return hidden_states
+
+
+ class LongcatFlashForCausalLMNextN(LongcatFlashForCausalLM):
+
+     def __init__(
+         self,
+         config: LongcatFlashConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         nn.Module.__init__(self)
+         self.config = config
+         self.quant_config = (
+             None
+             if "mtp" in getattr(config, "disable_quant_module", [])
+             else quant_config
+         )
+         self.model = LongcatFlashModelNextN(config, self.quant_config)
+         self.lm_head = ParallelLMHead(
+             config.vocab_size,
+             config.hidden_size,
+             quant_config=self.quant_config,
+         )
+         self.logits_processor = LogitsProcessor(config)
+
+     @torch.no_grad()
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         forward_batch: ForwardBatch,
+     ) -> torch.Tensor:
+         hidden_states = self.model(input_ids, positions, forward_batch)
+         return self.logits_processor(
+             input_ids, hidden_states, self.lm_head, forward_batch
+         )
+
+     def post_load_weights(self):
+         self_attn = self.model.decoder.self_attn
+         if hasattr(self_attn.kv_b_proj, "qweight"):
+             # AWQ compatible
+             if _is_cuda or _is_hip:
+                 w = awq_dequantize(
+                     self_attn.kv_b_proj.qweight,
+                     self_attn.kv_b_proj.scales,
+                     self_attn.kv_b_proj.qzeros,
+                 ).T
+             else:
+                 w = awq_dequantize(
+                     self_attn.kv_b_proj.qweight,
+                     self_attn.kv_b_proj.scales,
+                     self_attn.kv_b_proj.qzeros,
+                     0,
+                     0,
+                     0,
+                 ).T
+         else:
+             w = self_attn.kv_b_proj.weight
+         # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
+         # This may affect the accuracy of fp8 model.
+         # Fix deepseek v3 blockwise bmm by using deep_gemm
+         use_deep_gemm_bmm = False
+         if w.dtype in (
+             torch.float8_e4m3fn,
+             torch.float8_e4m3fnuz,
+         ):
+             if (
+                 hasattr(self.quant_config, "weight_block_size")
+                 and self.quant_config.weight_block_size is not None
+             ):
+                 weight_block_size = self.quant_config.weight_block_size
+                 assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                 if _is_fp8_fnuz:
+                     weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                         weight=w,
+                         weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                         input_scale=None,
+                     )
+                 else:
+                     weight = w
+                     weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                 if (
+                     _is_cuda
+                     and weight_block_size[0] == 128
+                     and weight_block_size[1] == 128
+                 ):
+                     if (
+                         deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                         and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                         and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                     ):
+                         block_scale = weight_scale
+                         use_deep_gemm_bmm = True
+                     else:
+                         w = block_quant_dequant(
+                             weight,
+                             weight_scale,
+                             weight_block_size,
+                             torch.bfloat16,
+                         )
+                 else:
+                     w, scale = block_quant_to_tensor_quant(
+                         weight, weight_scale, weight_block_size
+                     )
+                     self_attn.w_scale = scale
+             else:
+                 if _is_fp8_fnuz:
+                     weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                         weight=w,
+                         weight_scale=self_attn.kv_b_proj.weight_scale,
+                         input_scale=None,
+                     )
+                 else:
+                     weight = w
+                     weight_scale = self_attn.kv_b_proj.weight_scale
+                 w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                 self_attn.w_scale = scale
+         if w.dtype == torch.int8:
+             if hasattr(self.quant_config, "weight_block_size"):
+                 # block-wise int8 need it
+                 weight_block_size = self.quant_config.weight_block_size
+                 if weight_block_size is not None:
+                     assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                     weight = w
+                     weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                     w = int8_block_dequant(weight, weight_scale, weight_block_size).to(
+                         torch.bfloat16
+                     )
+             else:
+                 # channel-wise int8 need it
+                 w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                     torch.bfloat16
+                 )
+         w_kc, w_vc = w.unflatten(
+             0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+         ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+         if not use_deep_gemm_bmm:
+             self_attn.w_kc = bind_or_assign(
+                 self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+             )
+             self_attn.w_vc = bind_or_assign(
+                 self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+             )
+             if (
+                 hasattr(self_attn.kv_b_proj, "weight_scale")
+                 and self_attn.w_scale is None
+             ):
+                 self_attn.w_scale = bind_or_assign(
+                     self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                 )
+                 if _is_hip:
+                     self_attn.w_scale *= 2.0
+             # TODO: remove this after adding FP8 support in bmm cpu kernel
+             if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
+                 self_attn.w_kc = self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                 self_attn.w_vc = self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+         else:
+             num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+             num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+             ws_kc, ws_vc = block_scale.unflatten(
+                 0, (-1, (num_tiles_k + num_tiles_n))
+             ).split([num_tiles_k, num_tiles_n], dim=1)
+             self_attn.w_scale_k = bind_or_assign(
+                 self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+             )
+             self_attn.w_scale_v = bind_or_assign(
+                 self_attn.w_scale_v, ws_vc.contiguous()
+             )
+             self_attn.w_kc = bind_or_assign(
+                 self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+             )
+             self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+             self_attn.use_deep_gemm_bmm = True
+
+         if self.config.mla_scale_q_lora:
+             self_attn.q_a_layernorm.weight.data *= (
+                 self.config.hidden_size / self.config.q_lora_rank
+             ) ** 0.5
+         if self.config.mla_scale_kv_lora:
+             self_attn.kv_a_layernorm.weight.data *= (
+                 self.config.hidden_size / self.config.kv_lora_rank
+             ) ** 0.5
+
+         if (
+             deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+             and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+             and hasattr(self.quant_config, "weight_block_size")
+             and self.quant_config.weight_block_size is not None
+         ):
+             self._weight_requant_ue8m0()
+
+     def _weight_requant_ue8m0(self):
+         weight_block_size = self.quant_config.weight_block_size
+         layer = self.model.decoder
+         for module in [
+             layer.self_attn.fused_qkv_a_proj_with_mqa,
+             layer.self_attn.q_b_proj,
+             layer.self_attn.kv_b_proj,
+             layer.self_attn.o_proj,
+         ]:
+             requant_weight_ue8m0_inplace(
+                 module.weight, module.weight_scale_inv, weight_block_size
+             )
+         mlp = layer.mlps
+         assert isinstance(mlp, LongcatFlashMLP)
+         for module in [
+             mlp.gate_up_proj,
+             mlp.down_proj,
+         ]:
+             requant_weight_ue8m0_inplace(
+                 module.weight, module.weight_scale_inv, weight_block_size
+             )
+
+     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+         stacked_params_mapping = [
+             # (param_name, shard_name, shard_id)
+             ("gate_up_proj", "gate_proj", 0),
+             ("gate_up_proj", "up_proj", 1),
+         ]
+
+         # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+         fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+             self.config.q_lora_rank is not None
+         )
+         cached_a_proj = {} if fuse_qkv_a_proj else None
+
+         nextn_layer_prefix = "model.layers.0"
+         nextn_spec_weight_names = [
+             "shared_head.norm",
+             "eh_proj",
+             "enorm",
+             "hnorm",
+             "final_layernorm",
+         ]
+
+         weight_names_mapping = {
+             "model.mtp.embed_tokens.weight": "embed_tokens.weight",
+             "model.mtp.layers.0.eh_proj.weight": "eh_proj.weight",
+             "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
+             "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
+             "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
+             "model.mtp.layers.0.input_layernorm.weight": "layers.0.input_layernorm.weight",
+             "model.mtp.layers.0.post_attention_layernorm.weight": "layers.0.post_attention_layernorm.weight",
+             "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "layers.0.self_attn.kv_a_layernorm.weight",
+             "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "layers.0.self_attn.kv_a_proj_with_mqa.weight",
+             "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
+             "model.mtp.layers.0.self_attn.kv_b_proj.weight": "layers.0.self_attn.kv_b_proj.weight",
+             "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "layers.0.self_attn.kv_b_proj.weight_scale_inv",
+             "model.mtp.layers.0.self_attn.o_proj.weight": "layers.0.self_attn.o_proj.weight",
+             "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "layers.0.self_attn.o_proj.weight_scale_inv",
+             "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "layers.0.self_attn.q_a_layernorm.weight",
+             "model.mtp.layers.0.self_attn.q_a_proj.weight": "layers.0.self_attn.q_a_proj.weight",
+             "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "layers.0.self_attn.q_a_proj.weight_scale_inv",
+             "model.mtp.layers.0.self_attn.q_b_proj.weight": "layers.0.self_attn.q_b_proj.weight",
+             "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "layers.0.self_attn.q_b_proj.weight_scale_inv",
+             "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "layers.0.mlp.down_proj.weight",
+             "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "layers.0.mlp.down_proj.weight_scale_inv",
+             "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "layers.0.mlp.gate_proj.weight",
+             "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "layers.0.mlp.gate_proj.weight_scale_inv",
+             "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "layers.0.mlp.up_proj.weight",
+             "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "layers.0.mlp.up_proj.weight_scale_inv",
+             "model.mtp.norm.weight": "layers.0.final_layernorm.weight",
+         }
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             futures = []
+             params_dict = dict(self.named_parameters())
+             weight_names = []
+             for name, loaded_weight in weights:
+                 if ".mtp." not in name:
+                     continue
+                 if name in weight_names_mapping:
+                     name = weight_names_mapping[name]
+                 if name.startswith("layers.0"):
+                     name = "model." + name
+                 if (
+                     name.startswith("enorm")
+                     or name.startswith("hnorm")
+                     or name.startswith("eh_proj")
+                 ):
+                     name = nextn_layer_prefix + "." + name
+                 if not name.startswith(nextn_layer_prefix):
+                     continue
+
+                 # Use shared head and embed weights from target model
+                 if "shared_head.head" in name or "embed_tokens" in name:
+                     continue
+
+                 is_decoder = True
+                 # For nextn specific weights
+                 for weight_name in nextn_spec_weight_names:
+                     if weight_name in name:
+                         name = name.replace(nextn_layer_prefix, "model")
+                         is_decoder = False
+                         break
+                 # For decoder layer weights
+                 if is_decoder:
+                     name = name.replace(nextn_layer_prefix, "model.decoder")
+
+                 weight_names.append(name)
+                 if "rotary_emb.inv_freq" in name:
+                     continue
+                 for param_name, weight_name, shard_id in stacked_params_mapping:
+                     # Skip non-stacked layers and experts (experts handled below).
+                     if weight_name not in name:
+                         continue
+                     # We have mlp.experts[0].gate_proj in the checkpoint.
+                     # Since we handle the experts below in expert_params_mapping,
+                     # we need to skip here BEFORE we update the name, otherwise
+                     # name will be updated to mlp.experts[0].gate_up_proj, which
+                     # will then be updated below in expert_params_mapping
+                     # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                     if ("mlp.experts." in name) and name not in params_dict:
+                         continue
+                     name = name.replace(weight_name, param_name)
+                     # Skip loading extra bias for GPTQ models.
+                     if name.endswith(".bias") and name not in params_dict:
+                         continue
+                     param = params_dict[name]
+                     weight_loader = param.weight_loader
+                     futures.append(
+                         executor.submit(weight_loader, param, loaded_weight, shard_id)
+                     )
+                     break
+                 else:
+                     # Skip loading extra bias for GPTQ models.
+                     if name.endswith(".bias") and name not in params_dict:
+                         continue
+                     if fuse_qkv_a_proj and (
+                         "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                     ):
+                         cached_a_proj[name] = loaded_weight
+                         q_a_proj_name = (
+                             name
+                             if "q_a_proj" in name
+                             else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                         )
+                         kv_a_proj_name = (
+                             name
+                             if "kv_a_proj_with_mqa" in name
+                             else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                         )
+
+                         # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                         if (
+                             q_a_proj_name in cached_a_proj
+                             and kv_a_proj_name in cached_a_proj
+                         ):
+                             q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                             kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                             cat_dim = 0
+                             if self.quant_config is not None and (
+                                 self.quant_config.get_name() == "awq"
+                                 or self.quant_config.get_name() == "awq_marlin"
+                                 or self.quant_config.get_name() == "moe_wna16"
+                             ):
+                                 cat_dim = 1
+                             fused_weight = torch.cat(
+                                 [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                             )
+                             param_name = (
+                                 name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                 if "q_a_proj" in name
+                                 else name.replace(
+                                     "kv_a_proj_with_mqa",
+                                     "fused_qkv_a_proj_with_mqa",
+                                 )
+                             )
+                             param = params_dict[param_name]
+
+                             weight_loader = getattr(
+                                 param, "weight_loader", default_weight_loader
+                             )
+                             futures.append(
+                                 executor.submit(weight_loader, param, fused_weight)
+                             )
+                             cached_a_proj.pop(q_a_proj_name)
+                             cached_a_proj.pop(kv_a_proj_name)
+                     else:
+                         if (
+                             "k_scale" in name or "v_scale" in name
+                         ) and name not in params_dict:
+                             # modelopt attn kv scale is named differently
+                             for scale in ["k_scale", "v_scale"]:
+                                 if scale in name:
+                                     name = name.replace(f"{scale[0]}_proj", "attn_mqa")
+                                     break
+                         if name not in params_dict:
+                             # modelopt ckpt contains not needed weights for MTP module:
+                             # model.decoder.self_attn.attn_mqa.v_scale and
+                             # model.decoder.self_attn.attn_mqa.k_scale
+                             logger.warning(f"{name} not found in params_dict.")
+                             continue
+                         param = params_dict[name]
+                         weight_loader = getattr(
+                             param, "weight_loader", default_weight_loader
+                         )
+                         futures.append(
+                             executor.submit(weight_loader, param, loaded_weight)
+                         )
+         self.post_load_weights()
+
+
+ EntryClass = [LongcatFlashForCausalLMNextN]