sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +2 -0
  3. sglang/srt/configs/longcat_flash.py +104 -0
  4. sglang/srt/configs/model_config.py +14 -1
  5. sglang/srt/connector/__init__.py +1 -1
  6. sglang/srt/connector/base_connector.py +1 -2
  7. sglang/srt/connector/redis.py +2 -2
  8. sglang/srt/connector/serde/__init__.py +1 -1
  9. sglang/srt/connector/serde/safe_serde.py +4 -3
  10. sglang/srt/disaggregation/ascend/conn.py +75 -0
  11. sglang/srt/disaggregation/launch_lb.py +0 -13
  12. sglang/srt/disaggregation/mini_lb.py +33 -8
  13. sglang/srt/disaggregation/prefill.py +1 -1
  14. sglang/srt/distributed/parallel_state.py +27 -15
  15. sglang/srt/entrypoints/engine.py +19 -12
  16. sglang/srt/entrypoints/http_server.py +174 -34
  17. sglang/srt/entrypoints/openai/protocol.py +60 -0
  18. sglang/srt/eplb/eplb_manager.py +26 -2
  19. sglang/srt/eplb/expert_distribution.py +29 -2
  20. sglang/srt/hf_transformers_utils.py +10 -0
  21. sglang/srt/layers/activation.py +12 -0
  22. sglang/srt/layers/attention/ascend_backend.py +240 -109
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  24. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  25. sglang/srt/layers/layernorm.py +28 -3
  26. sglang/srt/layers/linear.py +3 -2
  27. sglang/srt/layers/logits_processor.py +1 -1
  28. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  29. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  30. sglang/srt/layers/moe/ep_moe/layer.py +14 -13
  31. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  32. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  34. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  37. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  38. sglang/srt/layers/moe/topk.py +35 -12
  39. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  40. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  41. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  42. sglang/srt/layers/quantization/mxfp4.py +9 -4
  43. sglang/srt/layers/quantization/utils.py +13 -0
  44. sglang/srt/layers/quantization/w4afp8.py +30 -25
  45. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  46. sglang/srt/layers/rotary_embedding.py +28 -1
  47. sglang/srt/layers/sampler.py +29 -5
  48. sglang/srt/managers/cache_controller.py +62 -96
  49. sglang/srt/managers/detokenizer_manager.py +9 -2
  50. sglang/srt/managers/io_struct.py +27 -0
  51. sglang/srt/managers/mm_utils.py +5 -1
  52. sglang/srt/managers/multi_tokenizer_mixin.py +629 -0
  53. sglang/srt/managers/scheduler.py +39 -2
  54. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  55. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  56. sglang/srt/managers/tokenizer_manager.py +86 -39
  57. sglang/srt/mem_cache/chunk_cache.py +1 -1
  58. sglang/srt/mem_cache/hicache_storage.py +20 -3
  59. sglang/srt/mem_cache/hiradix_cache.py +94 -71
  60. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  61. sglang/srt/mem_cache/memory_pool.py +4 -0
  62. sglang/srt/mem_cache/memory_pool_host.py +4 -4
  63. sglang/srt/mem_cache/radix_cache.py +5 -4
  64. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  65. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  66. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -9
  67. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
  68. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  69. sglang/srt/model_executor/model_runner.py +5 -4
  70. sglang/srt/model_loader/loader.py +15 -24
  71. sglang/srt/model_loader/utils.py +12 -0
  72. sglang/srt/models/deepseek_v2.py +31 -10
  73. sglang/srt/models/gpt_oss.py +5 -18
  74. sglang/srt/models/llama_eagle3.py +4 -0
  75. sglang/srt/models/longcat_flash.py +1026 -0
  76. sglang/srt/models/longcat_flash_nextn.py +699 -0
  77. sglang/srt/models/qwen2.py +26 -3
  78. sglang/srt/models/qwen2_5_vl.py +65 -41
  79. sglang/srt/models/qwen2_moe.py +22 -2
  80. sglang/srt/models/transformers.py +1 -1
  81. sglang/srt/multimodal/processors/base_processor.py +4 -2
  82. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  83. sglang/srt/server_args.py +112 -55
  84. sglang/srt/speculative/eagle_worker.py +28 -8
  85. sglang/srt/utils.py +4 -0
  86. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  87. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  88. sglang/version.py +1 -1
  89. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +5 -5
  90. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +93 -85
  91. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
  92. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
  93. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,699 @@
1
+ # Apache License, Version 2.0:
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ #
14
+ # MIT License:
15
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ # of this software and associated documentation files (the "Software"), to deal
17
+ # in the Software without restriction, including without limitation the rights
18
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ # copies of the Software, and to permit persons to whom the Software is
20
+ # furnished to do so, subject to the following conditions:
21
+ #
22
+ # The above copyright notice and this permission notice shall be included in all
23
+ # copies or substantial portions of the Software.
24
+ #
25
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ # SOFTWARE.
32
+
33
+ import concurrent.futures
34
+ import logging
35
+ import os
36
+ from enum import IntEnum, auto
37
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
38
+
39
+ import torch
40
+ import torch.nn.functional as F
41
+ from torch import nn
42
+ from tqdm import tqdm
43
+
44
+ from sglang.srt.configs import LongcatFlashConfig
45
+ from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
46
+ from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
47
+ from sglang.srt.layers.dp_attention import (
48
+ get_attention_tp_rank,
49
+ get_attention_tp_size,
50
+ is_dp_attention_enabled,
51
+ )
52
+ from sglang.srt.layers.layernorm import RMSNorm
53
+ from sglang.srt.layers.linear import ReplicatedLinear
54
+ from sglang.srt.layers.logits_processor import LogitsProcessor
55
+ from sglang.srt.layers.quantization import deep_gemm_wrapper
56
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
57
+ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
58
+ from sglang.srt.layers.quantization.fp8_utils import (
59
+ block_quant_dequant,
60
+ block_quant_to_tensor_quant,
61
+ channel_quant_to_tensor_quant,
62
+ normalize_e4m3fn_to_e4m3fnuz,
63
+ requant_weight_ue8m0_inplace,
64
+ )
65
+ from sglang.srt.layers.quantization.int8_utils import (
66
+ block_dequant as int8_block_dequant,
67
+ )
68
+ from sglang.srt.layers.vocab_parallel_embedding import (
69
+ ParallelLMHead,
70
+ VocabParallelEmbedding,
71
+ )
72
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
73
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
74
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
75
+ from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
76
+ from sglang.srt.utils import (
77
+ BumpAllocator,
78
+ LazyValue,
79
+ add_prefix,
80
+ bind_or_assign,
81
+ cpu_has_amx_support,
82
+ get_bool_env_var,
83
+ get_device_sm,
84
+ is_cpu,
85
+ is_cuda,
86
+ is_hip,
87
+ is_npu,
88
+ )
89
+
90
# Platform / feature flags, each evaluated once when the module is imported.
_is_hip = is_hip()
_is_cuda = is_cuda()
_is_npu = is_npu()
_is_fp8_fnuz = is_fp8_fnuz()
# Only true when the SGLANG_USE_AITER environment flag is set AND we are on HIP.
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()
_device_sm = get_device_sm()

# Select the `awq_dequantize` implementation for the current platform.
if _is_cuda:
    from sgl_kernel import (
        awq_dequantize,
        bmm_fp8,
        dsv3_fused_a_gemm,
        dsv3_router_gemm,
        merge_state_v2,
    )
elif _is_cpu and _is_cpu_amx_available:
    # NOTE(review): nothing is imported on this branch, so `awq_dequantize` is
    # left undefined here; any later call to it on CPU+AMX would raise NameError.
    pass
elif _is_hip:
    from sglang.srt.layers.quantization.awq_triton import (
        awq_dequantize_triton as awq_dequantize,
    )
else:
    from vllm._custom_ops import awq_dequantize


logger = logging.getLogger(__name__)
118
+
119
+
120
class LongcatFlashDenseDecoderLayer(nn.Module):
    """Dense (non-sparse) decoder layer: MLA self-attention followed by an MLP.

    The layer owns two RMSNorm modules (input / post-attention) and hands them
    to a ``LayerCommunicator``, which is invoked before attention, before the
    MLP, and once more after the MLP.
    """

    def __init__(
        self,
        config: LongcatFlashConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        alt_stream: Optional[torch.cuda.Stream] = None,
    ) -> None:
        """Build the attention, MLP, norm and communicator sub-modules.

        Args:
            config: Model configuration supplying all dimensions and epsilons.
            layer_id: Index of this layer; forwarded to the attention module
                and to ``LayerScatterModes.init_new``.
            quant_config: Optional quantization config passed to the attention
                and MLP sub-modules.
            prefix: Name prefix combined with each sub-module name through
                ``add_prefix``.
            alt_stream: Optional secondary stream forwarded to the attention
                module; ``None`` is accepted.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.layer_id = layer_id
        self.alt_stream = alt_stream

        self.self_attn = DeepseekV2AttentionMLA(
            config=config,
            hidden_size=config.hidden_size,
            num_heads=config.num_attention_heads,
            qk_nope_head_dim=config.qk_nope_head_dim,
            qk_rope_head_dim=config.qk_rope_head_dim,
            v_head_dim=config.v_head_dim,
            q_lora_rank=config.q_lora_rank,
            kv_lora_rank=config.kv_lora_rank,
            rope_theta=config.rope_theta,
            rope_scaling=None,
            max_position_embeddings=config.max_position_embeddings,
            quant_config=quant_config,
            layer_id=layer_id,
            reduce_results=False,
            prefix=add_prefix("self_attn", prefix),
            alt_stream=self.alt_stream,
        )

        # NOTE(review): the prefix string is "mlps" while the attribute is
        # named "mlp". The string is kept unchanged on purpose; confirm it
        # matches the naming expected by the quantization config.
        self.mlp = LongcatFlashMLP(
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            prefix=add_prefix("mlps", prefix),
        )
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

        self.attn_tp_size = get_attention_tp_size()
        self.attn_tp_rank = get_attention_tp_rank()
        # This layer and its predecessor are both declared dense.
        self.layer_scatter_modes = LayerScatterModes.init_new(
            layer_id=self.layer_id,
            num_layers=config.num_hidden_layers,
            is_layer_sparse=False,
            is_previous_layer_sparse=False,
        )
        self.layer_communicator = LayerCommunicator(
            layer_scatter_modes=self.layer_scatter_modes,
            input_layernorm=self.input_layernorm,
            post_attention_layernorm=self.post_attention_layernorm,
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
        residual: Optional[torch.Tensor],
        zero_allocator: BumpAllocator,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Run attention and the MLP for one layer.

        Args:
            positions: Position tensor forwarded to the attention module.
            hidden_states: Input activations for this layer.
            forward_batch: Batch metadata passed to attention and to every
                communicator step.
            residual: Residual stream from the previous layer, or ``None``.
            zero_allocator: Allocator forwarded to the attention module.

        Returns:
            The ``(hidden_states, residual)`` pair produced by
            ``LayerCommunicator.postprocess_layer``.
        """
        hidden_states, residual = self.layer_communicator.prepare_attn(
            hidden_states, residual, forward_batch
        )
        # Attention is skipped entirely when this rank has no tokens.
        if hidden_states.shape[0] != 0:
            hidden_states = self.self_attn(
                positions=positions,
                hidden_states=hidden_states,
                forward_batch=forward_batch,
                zero_allocator=zero_allocator,
            )

        hidden_states, residual = self.layer_communicator.prepare_mlp(
            hidden_states, residual, forward_batch
        )
        hidden_states = self.mlp(hidden_states)
        hidden_states, residual = self.layer_communicator.postprocess_layer(
            hidden_states, residual, forward_batch
        )
        return hidden_states, residual
209
+
210
+
211
class LongcatFlashModelNextN(nn.Module):
    """Backbone of the NextN head: embedding fusion plus one dense decoder layer.

    The forward pass normalizes the token embedding (``enorm``) and the hidden
    states carried in ``forward_batch.spec_info`` (``hnorm``), concatenates
    them on the last dimension, projects the result from ``2 * hidden_size``
    back to ``hidden_size`` (``eh_proj``), runs a single
    ``LongcatFlashDenseDecoderLayer`` and applies ``final_layernorm``.
    """

    def __init__(
        self,
        config: LongcatFlashConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Create the embedding, norms, projection and the decoder layer.

        Args:
            config: Model configuration supplying vocabulary/hidden sizes.
            quant_config: Optional quantization config for ``eh_proj`` and the
                decoder layer.
            prefix: Name prefix used for the embedding module.
        """
        super().__init__()
        self.vocab_size = config.vocab_size
        # Creating a CUDA stream fails when no CUDA/HIP device is available.
        # The decoder layer accepts ``alt_stream=None``, so fall back to None
        # instead of crashing on the CPU code paths this file also supports.
        self.alt_stream = torch.cuda.Stream() if torch.cuda.is_available() else None

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            enable_tp=not is_dp_attention_enabled(),
            prefix=add_prefix("embed_tokens", prefix),
        )

        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # NOTE(review): the prefix here is built from "" rather than `prefix`;
        # kept unchanged — confirm this is intentional.
        self.eh_proj = ReplicatedLinear(
            2 * config.hidden_size,
            config.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("eh_proj", ""),
        )
        self.decoder = LongcatFlashDenseDecoderLayer(
            config, 0, quant_config=quant_config, alt_stream=self.alt_stream
        )

        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def get_input_embeddings(self) -> VocabParallelEmbedding:
        """Return the token-embedding module (not a tensor)."""
        return self.embed_tokens

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute the NextN hidden states for a batch.

        Args:
            input_ids: Token ids; used for the embedding lookup and device
                selection when ``input_embeds`` is ``None``.
            positions: Position tensor forwarded to the decoder layer.
            forward_batch: Batch metadata; ``spec_info.hidden_states``,
                ``can_run_tbo`` and ``forward_mode`` are read from it.
            input_embeds: Optional precomputed embeddings that replace the
                embedding lookup.

        Returns:
            Hidden states after the decoder layer, passed through
            ``final_layernorm`` unless the forward mode is idle.
        """
        total_num_layers = 1
        device = input_embeds.device if input_embeds is not None else input_ids.device
        zero_allocator = BumpAllocator(
            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
            dtype=torch.float32,
            device=device,
        )
        if input_embeds is None:
            hidden_states = self.embed_tokens(input_ids)
        else:
            hidden_states = input_embeds

        # Skip the fusion projection when this rank holds no tokens.
        if hidden_states.shape[0] > 0:
            hidden_states, _ = self.eh_proj(
                torch.cat(
                    (
                        self.enorm(hidden_states),
                        self.hnorm(forward_batch.spec_info.hidden_states),
                    ),
                    dim=-1,
                )
            )

        residual = None
        # The expert-distribution recorder is disabled while the decoder runs.
        with get_global_expert_distribution_recorder().disable_this_region():
            hidden_states, residual = self.decoder(
                positions, hidden_states, forward_batch, residual, zero_allocator
            )

        if not forward_batch.forward_mode.is_idle():
            if residual is not None:
                hidden_states, _ = self.final_layernorm(hidden_states, residual)
            else:
                hidden_states = self.final_layernorm(hidden_states)
        return hidden_states
290
+
291
+
292
class LongcatFlashForCausalLMNextN(LongcatFlashForCausalLM):
    """Causal-LM wrapper around ``LongcatFlashModelNextN``.

    Only checkpoint tensors whose name contains ``.mtp.`` are loaded; see
    ``load_weights`` for the name remapping.
    """

    def __init__(
        self,
        config: LongcatFlashConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        """Build the NextN backbone, LM head and logits processor.

        ``nn.Module.__init__`` is called directly, so the parent
        ``LongcatFlashForCausalLM.__init__`` is deliberately not executed.

        Args:
            config: Model configuration.
            quant_config: Quantization config; ignored (treated as ``None``)
                when ``"mtp"`` is listed in ``config.disable_quant_module``.
        """
        nn.Module.__init__(self)
        self.config = config
        self.quant_config = (
            None
            if "mtp" in getattr(config, "disable_quant_module", [])
            else quant_config
        )
        self.model = LongcatFlashModelNextN(config, self.quant_config)
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=self.quant_config,
        )
        self.logits_processor = LogitsProcessor(config)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        """Run the backbone and return the logits processor's output."""
        hidden_states = self.model(input_ids, positions, forward_batch)
        return self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch
        )

    def post_load_weights(self):
        """Derive the absorbed MLA weights (``w_kc`` / ``w_vc``) from ``kv_b_proj``.

        Depending on the dtype of the loaded ``kv_b_proj`` weight, the weight
        is dequantized or re-quantized (AWQ, FP8 block-wise / channel-wise,
        INT8 block-wise / channel-wise), split per head into the key and value
        parts, and bound onto the attention module. Afterwards the optional
        layernorm rescaling from the config and the optional UE8M0 requant
        are applied.
        """
        self_attn = self.model.decoder.self_attn
        if hasattr(self_attn.kv_b_proj, "qweight"):
            # AWQ compatible
            if _is_cuda or _is_hip:
                w = awq_dequantize(
                    self_attn.kv_b_proj.qweight,
                    self_attn.kv_b_proj.scales,
                    self_attn.kv_b_proj.qzeros,
                ).T
            else:
                w = awq_dequantize(
                    self_attn.kv_b_proj.qweight,
                    self_attn.kv_b_proj.scales,
                    self_attn.kv_b_proj.qzeros,
                    0,
                    0,
                    0,
                ).T
        else:
            w = self_attn.kv_b_proj.weight
        use_deep_gemm_bmm = False
        if w.dtype in (
            torch.float8_e4m3fn,
            torch.float8_e4m3fnuz,
        ):
            if (
                hasattr(self.quant_config, "weight_block_size")
                and self.quant_config.weight_block_size is not None
            ):
                # Block-wise FP8.
                weight_block_size = self.quant_config.weight_block_size
                assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                if _is_fp8_fnuz:
                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                        weight=w,
                        weight_scale=self_attn.kv_b_proj.weight_scale_inv,
                        input_scale=None,
                    )
                else:
                    weight = w
                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
                if (
                    _is_cuda
                    and weight_block_size[0] == 128
                    and weight_block_size[1] == 128
                ):
                    if (
                        deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
                        and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
                        and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
                    ):
                        # Keep the FP8 weight and its block scales; they are
                        # split per head in the deep-gemm branch below.
                        block_scale = weight_scale
                        use_deep_gemm_bmm = True
                    else:
                        w = block_quant_dequant(
                            weight,
                            weight_scale,
                            weight_block_size,
                            torch.bfloat16,
                        )
                else:
                    w, scale = block_quant_to_tensor_quant(
                        weight, weight_scale, weight_block_size
                    )
                    self_attn.w_scale = scale
            else:
                # Channel-wise FP8.
                if _is_fp8_fnuz:
                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                        weight=w,
                        weight_scale=self_attn.kv_b_proj.weight_scale,
                        input_scale=None,
                    )
                else:
                    weight = w
                    weight_scale = self_attn.kv_b_proj.weight_scale
                w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
                self_attn.w_scale = scale
        if w.dtype == torch.int8:
            if hasattr(self.quant_config, "weight_block_size"):
                # block-wise int8 need it
                weight_block_size = self.quant_config.weight_block_size
                if weight_block_size is not None:
                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                    weight = w
                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
                    w = int8_block_dequant(weight, weight_scale, weight_block_size).to(
                        torch.bfloat16
                    )
            else:
                # channel-wise int8 need it
                w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
                    torch.bfloat16
                )
        # Group rows into chunks of (qk_nope_head_dim + v_head_dim), then split
        # each chunk into its key part and its value part.
        w_kc, w_vc = w.unflatten(
            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
        ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
        if not use_deep_gemm_bmm:
            self_attn.w_kc = bind_or_assign(
                self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
            )
            self_attn.w_vc = bind_or_assign(
                self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
            )
            if (
                hasattr(self_attn.kv_b_proj, "weight_scale")
                and self_attn.w_scale is None
            ):
                self_attn.w_scale = bind_or_assign(
                    self_attn.w_scale, self_attn.kv_b_proj.weight_scale
                )
                if _is_hip:
                    self_attn.w_scale *= 2.0
            # TODO: remove this after adding FP8 support in bmm cpu kernel
            if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
                self_attn.w_kc = self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
                self_attn.w_vc = self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
        else:
            # Deep-gemm BMM path: split the block scales the same way as the
            # weights, using the number of scale tiles per head.
            num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
            num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
            ws_kc, ws_vc = block_scale.unflatten(
                0, (-1, (num_tiles_k + num_tiles_n))
            ).split([num_tiles_k, num_tiles_n], dim=1)
            self_attn.w_scale_k = bind_or_assign(
                self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
            )
            self_attn.w_scale_v = bind_or_assign(
                self_attn.w_scale_v, ws_vc.contiguous()
            )
            self_attn.w_kc = bind_or_assign(
                self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
            )
            self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
            self_attn.use_deep_gemm_bmm = True

        # Optional in-place rescaling of the low-rank layernorm weights by
        # sqrt(hidden_size / lora_rank), as requested by the config flags.
        if self.config.mla_scale_q_lora:
            self_attn.q_a_layernorm.weight.data *= (
                self.config.hidden_size / self.config.q_lora_rank
            ) ** 0.5
        if self.config.mla_scale_kv_lora:
            self_attn.kv_a_layernorm.weight.data *= (
                self.config.hidden_size / self.config.kv_lora_rank
            ) ** 0.5

        if (
            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
            and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
            and hasattr(self.quant_config, "weight_block_size")
            and self.quant_config.weight_block_size is not None
        ):
            self._weight_requant_ue8m0()

    def _weight_requant_ue8m0(self):
        """Requantize, in place, every block-quantized weight of the decoder layer.

        Applies ``requant_weight_ue8m0_inplace`` to each attention projection
        and MLP projection that carries a ``weight_scale_inv`` attribute.
        """
        weight_block_size = self.quant_config.weight_block_size
        layer = self.model.decoder
        self_attn = layer.self_attn

        attn_modules = [self_attn.kv_b_proj, self_attn.o_proj]
        if self.config.q_lora_rank is not None:
            attn_modules += [self_attn.fused_qkv_a_proj_with_mqa, self_attn.q_b_proj]
        else:
            attn_modules += [self_attn.kv_a_proj_with_mqa, self_attn.q_proj]

        # The decoder layer stores its MLP as ``mlp``; reading ``layer.mlps``
        # (as the previous code did) raises AttributeError on nn.Module.
        mlp = layer.mlp
        assert isinstance(mlp, LongcatFlashMLP)

        for module in (*attn_modules, mlp.gate_up_proj, mlp.down_proj):
            if hasattr(module, "weight_scale_inv"):
                requant_weight_ue8m0_inplace(
                    module.weight, module.weight_scale_inv, weight_block_size
                )

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load the MTP (NextN) tensors of a checkpoint into this module.

        Tensors whose name does not contain ``.mtp.`` are ignored. Remaining
        names are translated to this module's parameter names; ``gate_proj``
        and ``up_proj`` are loaded as shards of ``gate_up_proj``; when
        ``q_lora_rank`` is set, ``q_a_proj`` and ``kv_a_proj_with_mqa`` are
        concatenated into ``fused_qkv_a_proj_with_mqa``. Loader calls run on a
        thread pool; any exception raised by a loader is re-raised here
        before ``post_load_weights`` is invoked.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
            self.config.q_lora_rank is not None
        )
        cached_a_proj = {} if fuse_qkv_a_proj else None

        nextn_layer_prefix = "model.layers.0"
        nextn_spec_weight_names = [
            "shared_head.norm",
            "eh_proj",
            "enorm",
            "hnorm",
            "final_layernorm",
        ]

        weight_names_mapping = {
            "model.mtp.embed_tokens.weight": "embed_tokens.weight",
            "model.mtp.layers.0.eh_proj.weight": "eh_proj.weight",
            "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
            "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
            "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
            "model.mtp.layers.0.input_layernorm.weight": "layers.0.input_layernorm.weight",
            "model.mtp.layers.0.post_attention_layernorm.weight": "layers.0.post_attention_layernorm.weight",
            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "layers.0.self_attn.kv_a_layernorm.weight",
            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "layers.0.self_attn.kv_a_proj_with_mqa.weight",
            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
            "model.mtp.layers.0.self_attn.kv_b_proj.weight": "layers.0.self_attn.kv_b_proj.weight",
            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "layers.0.self_attn.kv_b_proj.weight_scale_inv",
            "model.mtp.layers.0.self_attn.o_proj.weight": "layers.0.self_attn.o_proj.weight",
            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "layers.0.self_attn.o_proj.weight_scale_inv",
            "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "layers.0.self_attn.q_a_layernorm.weight",
            "model.mtp.layers.0.self_attn.q_a_proj.weight": "layers.0.self_attn.q_a_proj.weight",
            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "layers.0.self_attn.q_a_proj.weight_scale_inv",
            "model.mtp.layers.0.self_attn.q_b_proj.weight": "layers.0.self_attn.q_b_proj.weight",
            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "layers.0.self_attn.q_b_proj.weight_scale_inv",
            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "layers.0.mlp.down_proj.weight",
            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "layers.0.mlp.down_proj.weight_scale_inv",
            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "layers.0.mlp.gate_proj.weight",
            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "layers.0.mlp.gate_proj.weight_scale_inv",
            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "layers.0.mlp.up_proj.weight",
            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "layers.0.mlp.up_proj.weight_scale_inv",
            "model.mtp.norm.weight": "layers.0.final_layernorm.weight",
        }
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            params_dict = dict(self.named_parameters())
            for name, loaded_weight in weights:
                if ".mtp." not in name:
                    continue
                if name in weight_names_mapping:
                    name = weight_names_mapping[name]
                # Normalize every mapped name so it starts with "model.layers.0".
                if name.startswith("layers.0"):
                    name = "model." + name
                if name.startswith(("enorm", "hnorm", "eh_proj")):
                    name = nextn_layer_prefix + "." + name
                if not name.startswith(nextn_layer_prefix):
                    continue

                # Use shared head and embed weights from target model
                if "shared_head.head" in name or "embed_tokens" in name:
                    continue

                # NextN-specific weights live directly under "model"; every
                # other weight belongs to the single decoder layer.
                if any(spec_name in name for spec_name in nextn_spec_weight_names):
                    name = name.replace(nextn_layer_prefix, "model")
                else:
                    name = name.replace(nextn_layer_prefix, "model.decoder")

                if "rotary_emb.inv_freq" in name:
                    continue
                for param_name, weight_name, shard_id in stacked_params_mapping:
                    # Skip non-stacked layers and experts (experts handled below).
                    if weight_name not in name:
                        continue
                    # We have mlp.experts[0].gate_proj in the checkpoint.
                    # Since we handle the experts below in expert_params_mapping,
                    # we need to skip here BEFORE we update the name, otherwise
                    # name will be updated to mlp.experts[0].gate_up_proj, which
                    # will then be updated below in expert_params_mapping
                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                    if ("mlp.experts." in name) and name not in params_dict:
                        continue
                    name = name.replace(weight_name, param_name)
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    futures.append(
                        executor.submit(weight_loader, param, loaded_weight, shard_id)
                    )
                    break
                else:
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    if fuse_qkv_a_proj and (
                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
                    ):
                        cached_a_proj[name] = loaded_weight
                        q_a_proj_name = (
                            name
                            if "q_a_proj" in name
                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
                        )
                        kv_a_proj_name = (
                            name
                            if "kv_a_proj_with_mqa" in name
                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
                        )

                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
                        if (
                            q_a_proj_name in cached_a_proj
                            and kv_a_proj_name in cached_a_proj
                        ):
                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
                            # These quantization formats are concatenated on
                            # dim 1; everything else on dim 0.
                            cat_dim = 0
                            if (
                                self.quant_config is not None
                                and self.quant_config.get_name()
                                in ("awq", "awq_marlin", "moe_wna16")
                            ):
                                cat_dim = 1
                            fused_weight = torch.cat(
                                [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
                            )
                            param_name = (
                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
                                if "q_a_proj" in name
                                else name.replace(
                                    "kv_a_proj_with_mqa",
                                    "fused_qkv_a_proj_with_mqa",
                                )
                            )
                            param = params_dict[param_name]

                            weight_loader = getattr(
                                param, "weight_loader", default_weight_loader
                            )
                            futures.append(
                                executor.submit(weight_loader, param, fused_weight)
                            )
                            cached_a_proj.pop(q_a_proj_name)
                            cached_a_proj.pop(kv_a_proj_name)
                    else:
                        if (
                            "k_scale" in name or "v_scale" in name
                        ) and name not in params_dict:
                            # modelopt attn kv scale is named differently
                            for scale in ["k_scale", "v_scale"]:
                                if scale in name:
                                    name = name.replace(f"{scale[0]}_proj", "attn_mqa")
                                    break
                        if name not in params_dict:
                            # modelopt ckpt contains not needed weights for MTP module:
                            # model.decoder.self_attn.attn_mqa.v_scale and
                            # model.decoder.self_attn.attn_mqa.k_scale
                            logger.warning(f"{name} not found in params_dict.")
                            continue
                        param = params_dict[name]
                        weight_loader = getattr(
                            param, "weight_loader", default_weight_loader
                        )
                        futures.append(
                            executor.submit(weight_loader, param, loaded_weight)
                        )

            # Wait for every loader task and re-raise the first failure.
            # Without this, an exception inside a worker stays hidden in its
            # Future and post-processing would run on incomplete weights.
            for future in concurrent.futures.as_completed(futures):
                future.result()

        self.post_load_weights()
697
+
698
+
699
# Model classes exported by this module.
# NOTE(review): presumably read by the model loader/registry to discover this
# architecture — confirm against the loader.
EntryClass = [LongcatFlashForCausalLMNextN]