sglang 0.5.1.post2__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +79 -53
  3. sglang/bench_serving.py +186 -14
  4. sglang/profiler.py +0 -1
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/longcat_flash.py +104 -0
  7. sglang/srt/configs/model_config.py +12 -0
  8. sglang/srt/connector/__init__.py +1 -1
  9. sglang/srt/connector/base_connector.py +1 -2
  10. sglang/srt/connector/redis.py +2 -2
  11. sglang/srt/connector/serde/__init__.py +1 -1
  12. sglang/srt/connector/serde/safe_serde.py +4 -3
  13. sglang/srt/conversation.py +38 -5
  14. sglang/srt/disaggregation/ascend/conn.py +75 -0
  15. sglang/srt/disaggregation/launch_lb.py +0 -13
  16. sglang/srt/disaggregation/mini_lb.py +33 -8
  17. sglang/srt/disaggregation/prefill.py +1 -1
  18. sglang/srt/distributed/parallel_state.py +24 -14
  19. sglang/srt/entrypoints/engine.py +19 -12
  20. sglang/srt/entrypoints/http_server.py +174 -34
  21. sglang/srt/entrypoints/openai/protocol.py +87 -24
  22. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  23. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  24. sglang/srt/eplb/eplb_manager.py +26 -2
  25. sglang/srt/eplb/expert_distribution.py +29 -2
  26. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  27. sglang/srt/function_call/function_call_parser.py +2 -0
  28. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  29. sglang/srt/harmony_parser.py +588 -0
  30. sglang/srt/hf_transformers_utils.py +26 -7
  31. sglang/srt/layers/activation.py +12 -0
  32. sglang/srt/layers/attention/ascend_backend.py +374 -136
  33. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  34. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  35. sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
  36. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  38. sglang/srt/layers/communicator.py +1 -2
  39. sglang/srt/layers/layernorm.py +28 -3
  40. sglang/srt/layers/linear.py +3 -2
  41. sglang/srt/layers/logits_processor.py +1 -1
  42. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  43. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  44. sglang/srt/layers/moe/ep_moe/layer.py +13 -13
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/topk.py +35 -12
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  49. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  50. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  51. sglang/srt/layers/quantization/fp8.py +2 -1
  52. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  53. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  54. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  55. sglang/srt/layers/quantization/mxfp4.py +25 -27
  56. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  57. sglang/srt/layers/quantization/utils.py +13 -0
  58. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  59. sglang/srt/layers/rotary_embedding.py +28 -1
  60. sglang/srt/layers/sampler.py +29 -5
  61. sglang/srt/layers/utils.py +0 -14
  62. sglang/srt/managers/cache_controller.py +237 -204
  63. sglang/srt/managers/detokenizer_manager.py +48 -2
  64. sglang/srt/managers/io_struct.py +57 -0
  65. sglang/srt/managers/mm_utils.py +5 -1
  66. sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
  67. sglang/srt/managers/scheduler.py +94 -9
  68. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  69. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  70. sglang/srt/managers/tokenizer_manager.py +122 -42
  71. sglang/srt/mem_cache/chunk_cache.py +1 -1
  72. sglang/srt/mem_cache/hicache_storage.py +51 -23
  73. sglang/srt/mem_cache/hiradix_cache.py +87 -71
  74. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  75. sglang/srt/mem_cache/memory_pool.py +77 -14
  76. sglang/srt/mem_cache/memory_pool_host.py +4 -5
  77. sglang/srt/mem_cache/radix_cache.py +6 -4
  78. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  79. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +38 -20
  80. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +87 -82
  81. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  82. sglang/srt/model_executor/model_runner.py +6 -5
  83. sglang/srt/model_loader/loader.py +15 -24
  84. sglang/srt/model_loader/utils.py +12 -0
  85. sglang/srt/models/deepseek_v2.py +38 -13
  86. sglang/srt/models/gpt_oss.py +2 -15
  87. sglang/srt/models/llama_eagle3.py +4 -0
  88. sglang/srt/models/longcat_flash.py +1015 -0
  89. sglang/srt/models/longcat_flash_nextn.py +691 -0
  90. sglang/srt/models/qwen2.py +26 -3
  91. sglang/srt/models/qwen2_5_vl.py +66 -41
  92. sglang/srt/models/qwen2_moe.py +22 -2
  93. sglang/srt/models/transformers.py +1 -1
  94. sglang/srt/multimodal/processors/base_processor.py +4 -2
  95. sglang/srt/reasoning_parser.py +56 -300
  96. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  97. sglang/srt/server_args.py +122 -56
  98. sglang/srt/speculative/eagle_worker.py +28 -8
  99. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  100. sglang/srt/utils.py +73 -5
  101. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  102. sglang/version.py +1 -1
  103. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +7 -6
  104. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +107 -99
  105. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
  106. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.5.1.post2.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1015 @@
1
+ # Apache License, Version 2.0:
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ #
14
+ # MIT License:
15
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ # of this software and associated documentation files (the "Software"), to deal
17
+ # in the Software without restriction, including without limitation the rights
18
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ # copies of the Software, and to permit persons to whom the Software is
20
+ # furnished to do so, subject to the following conditions:
21
+ #
22
+ # The above copyright notice and this permission notice shall be included in all
23
+ # copies or substantial portions of the Software.
24
+ #
25
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ # SOFTWARE.
32
+
33
+ import concurrent.futures
34
+ import logging
35
+ import os
36
+ from enum import IntEnum, auto
37
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
38
+
39
+ import torch
40
+ import torch.nn.functional as F
41
+ from torch import nn
42
+ from tqdm import tqdm
43
+
44
+ from sglang.srt.configs import LongcatFlashConfig
45
+ from sglang.srt.distributed import (
46
+ get_tensor_model_parallel_world_size,
47
+ tensor_model_parallel_all_reduce,
48
+ )
49
+ from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
50
+ from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
51
+ from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
52
+ from sglang.srt.layers.activation import SiluAndMul
53
+ from sglang.srt.layers.amx_utils import PackWeightMethod
54
+ from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
55
+ from sglang.srt.layers.dp_attention import (
56
+ get_attention_tp_rank,
57
+ get_attention_tp_size,
58
+ is_dp_attention_enabled,
59
+ )
60
+ from sglang.srt.layers.layernorm import RMSNorm
61
+ from sglang.srt.layers.linear import (
62
+ MergedColumnParallelLinear,
63
+ ReplicatedLinear,
64
+ RowParallelLinear,
65
+ )
66
+ from sglang.srt.layers.logits_processor import LogitsProcessor
67
+ from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton
68
+ from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
69
+ from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
70
+ from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
71
+ from sglang.srt.layers.quantization import deep_gemm_wrapper
72
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
73
+ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
74
+ from sglang.srt.layers.quantization.fp8_utils import (
75
+ block_quant_dequant,
76
+ block_quant_to_tensor_quant,
77
+ channel_quant_to_tensor_quant,
78
+ normalize_e4m3fn_to_e4m3fnuz,
79
+ requant_weight_ue8m0_inplace,
80
+ )
81
+ from sglang.srt.layers.quantization.int8_utils import (
82
+ block_dequant as int8_block_dequant,
83
+ )
84
+ from sglang.srt.layers.vocab_parallel_embedding import (
85
+ ParallelLMHead,
86
+ VocabParallelEmbedding,
87
+ )
88
+ from sglang.srt.managers.schedule_batch import global_server_args_dict
89
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
90
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
91
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
92
+ from sglang.srt.utils import (
93
+ BumpAllocator,
94
+ LazyValue,
95
+ add_prefix,
96
+ bind_or_assign,
97
+ cpu_has_amx_support,
98
+ get_bool_env_var,
99
+ get_device_sm,
100
+ get_int_env_var,
101
+ is_cpu,
102
+ is_cuda,
103
+ is_flashinfer_available,
104
+ is_hip,
105
+ is_non_idle_and_non_empty,
106
+ is_npu,
107
+ is_sm100_supported,
108
+ )
109
+
110
# Platform / capability flags. Each helper is called once, at import time,
# and the result is cached in a module-level name.
_is_hip = is_hip()
_is_cuda = is_cuda()
_is_npu = is_npu()
_is_fp8_fnuz = is_fp8_fnuz()
# AITER is only enabled when SGLANG_USE_AITER is set AND the platform is HIP.
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()
_device_sm = get_device_sm()

# Platform-specific kernel imports. Every branch except the CPU+AMX one binds
# the name `awq_dequantize`.
if _is_cuda:
    from sgl_kernel import (
        awq_dequantize,
        bmm_fp8,
        dsv3_fused_a_gemm,
        dsv3_router_gemm,
        merge_state_v2,
    )
elif _is_cpu and _is_cpu_amx_available:
    # Nothing is imported on this path, so `awq_dequantize` is left undefined.
    pass
elif _is_hip:
    from sglang.srt.layers.quantization.awq_triton import (
        awq_dequantize_triton as awq_dequantize,
    )
else:
    from vllm._custom_ops import awq_dequantize

logger = logging.getLogger(__name__)
137
+
138
+
139
class LongcatFlashMLP(nn.Module):
    """Dense gated feed-forward block: merged gate/up projection, SiLU-and-mul, down projection."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        reduce_results: bool = False,
        prefix: str = "",
    ) -> None:
        """Build the projections.

        Args:
            hidden_size: Input and output feature size.
            intermediate_size: Size of each of the gate and up projections.
            hidden_act: Activation name; only ``"silu"`` is accepted.
            quant_config: Quantization config passed to both linear layers.
            reduce_results: Forwarded to the row-parallel down projection.
            prefix: Parameter-name prefix for the sub-modules.

        Raises:
            ValueError: If ``hidden_act`` is not ``"silu"``.
        """
        super().__init__()

        gate_up_prefix = add_prefix("gate_up_proj", prefix)
        # Gate and up projections share one merged column-parallel layer.
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=gate_up_prefix,
        )

        down_prefix = add_prefix("down_proj", prefix)
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            reduce_results=reduce_results,
            prefix=down_prefix,
        )

        if hidden_act != "silu":
            raise ValueError(
                f"Unsupported activation: {hidden_act}. "
                "Only silu is supported for now."
            )
        self.act_fn = SiluAndMul()

    def forward(
        self,
        x,
    ):
        """Apply gate/up projection, activation, then the down projection."""
        # Both linear layers return a pair; only the first element is used here.
        projected, _ = self.gate_up_proj(x)
        activated = self.act_fn(projected)
        output, _ = self.down_proj(activated)
        return output
180
+
181
+
182
class LongcatFlashRouter(nn.Module):
    """Router that produces one logit per routed expert plus per zero expert."""

    def __init__(
        self,
        config,
        zero_expert_num=0,
        rounter_params_dtype=torch.float32,
        prefix: str = "",
    ):
        """Build the classifier and the score-correction bias.

        Args:
            config: Model config; reads ``n_routed_experts``, ``hidden_size``
                and ``router_bias``.
            zero_expert_num: Number of extra "zero" experts appended to the
                routed experts in the classifier output.
            rounter_params_dtype: dtype of the classifier parameters and of
                the correction bias (the spelling is part of the interface).
            prefix: Parameter-name prefix for the sub-modules.
        """
        super().__init__()
        # Total classifier width = real routed experts + zero experts.
        self.n_routed_experts = config.n_routed_experts + zero_expert_num
        self.rounter_params_dtype = rounter_params_dtype

        classifier_prefix = add_prefix("classifier", prefix)
        self.classifier = ReplicatedLinear(
            config.hidden_size,
            self.n_routed_experts,
            bias=config.router_bias,
            params_dtype=rounter_params_dtype,
            quant_config=None,
            prefix=classifier_prefix,
        )

        initial_bias = torch.zeros(self.n_routed_experts, dtype=rounter_params_dtype)
        self.e_score_correction_bias = nn.Parameter(initial_bias)

    def forward(self, hidden_states):
        """Return router logits for ``hidden_states`` (cast to the router dtype first)."""
        router_input = hidden_states.to(self.rounter_params_dtype)
        logits, _ = self.classifier(router_input)
        return logits
209
+
210
+
211
class LongcatFlashMoE(nn.Module):
    """Mixture-of-experts block with an optional "zero expert" contribution."""

    def __init__(
        self,
        config: LongcatFlashConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        """Build the router, top-k selector and expert implementation.

        Args:
            config: Model config; reads routing, expert-count and size fields.
            layer_id: Layer index forwarded to the expert implementation.
            quant_config: Quantization config for the experts.
            prefix: Parameter-name prefix for the sub-modules.

        Raises:
            ValueError: If the tensor-parallel world size exceeds the number
                of routed experts, or if ``config.hidden_act`` is not ``"silu"``.
        """
        super().__init__()
        self.config = config
        self.layer_id = layer_id
        self.routed_scaling_factor = config.routed_scaling_factor
        self.num_experts = config.n_routed_experts
        self.top_k = config.moe_topk
        self.zero_expert_num = config.zero_expert_num
        self.zero_expert_type = config.zero_expert_type

        # Any value other than "float32" selects bfloat16.
        self.rounter_params_dtype = (
            torch.float32
            if config.rounter_params_dtype == "float32"
            else torch.bfloat16
        )

        self.tp_size = get_tensor_model_parallel_world_size()

        if self.tp_size > config.n_routed_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.n_routed_experts}."
            )

        if config.hidden_act != "silu":
            raise ValueError(
                f"Unsupported activation: {config.hidden_act}. "
                "Only silu is supported for now."
            )

        self.router = LongcatFlashRouter(
            config=self.config,
            zero_expert_num=self.zero_expert_num,
            rounter_params_dtype=self.rounter_params_dtype,
            prefix=add_prefix("router", prefix),
        )

        self.topk = TopK(
            top_k=self.top_k,
            renormalize=False,
            use_grouped_topk=False,
            correction_bias=self.router.e_score_correction_bias.data,
        )
        # Force the native top-k implementation for this layer.
        self.topk.forward = self.topk.forward_native

        moe_cls = get_moe_impl_class()
        self.experts = moe_cls(
            num_experts=self.num_experts,
            top_k=self.top_k,
            layer_id=self.layer_id,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            quant_config=quant_config,
            prefix=add_prefix("experts", prefix),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Route tokens, run the experts and return a ``(num_tokens, hidden_dim)`` tensor."""
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)

        # router_logits: (num_tokens, n_experts)
        router_logits = self.router(hidden_states)
        topk_weights, topk_idx, topk_extra = self.topk(
            hidden_states,
            router_logits,
        )

        has_zero_experts = self.zero_expert_type is not None
        if has_zero_experts:
            # Computed before the experts run.
            # NOTE(review): the kernel may rewrite topk_idx / topk_weights in
            # place - confirm before reordering these statements.
            zero_expert_result = zero_experts_compute_triton(
                expert_indices=topk_idx,
                expert_scales=topk_weights,
                num_experts=self.num_experts,
                zero_expert_type=self.zero_expert_type,
                hidden_states=hidden_states,
            )
        topk_output = StandardTopKOutput(topk_weights, topk_idx, topk_extra)

        expert_out = self.experts(hidden_states, topk_output)
        expert_out *= self.routed_scaling_factor

        if has_zero_experts and hidden_states.shape[0] > 0:
            expert_out += zero_expert_result.to(expert_out.device)

        if self.tp_size > 1:
            expert_out = tensor_model_parallel_all_reduce(expert_out)

        return expert_out.view(num_tokens, hidden_dim)

    def get_moe_weights(self):
        """Return the ``.data`` of every expert parameter except ``correction_bias``."""
        weights = []
        for param_name, param in self.experts.named_parameters():
            if param_name != "correction_bias":
                weights.append(param.data)
        return weights
310
+
311
+
312
class LongcatFlashDecoderLayer(nn.Module):
    """One LongCat-Flash decoder layer.

    Each layer owns two MLA attention modules, two dense MLPs and one MoE
    block. ``forward`` runs the first attention, then evaluates the MoE
    branch on a clone of the activations and the dense branch
    (``forward_mlp``: first MLP -> second attention -> second MLP) on the
    originals, and sums the two results.
    """

    def __init__(
        self,
        config: LongcatFlashConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        alt_stream: Optional[torch.cuda.Stream] = None,
    ) -> None:
        """Build the sub-modules and their communicators.

        Args:
            config: Model config.
            layer_id: Index of this decoder layer; the two attention modules
                are given ids ``layer_id * 2`` and ``layer_id * 2 + 1``.
            quant_config: Quantization config; dropped for ``self_attn`` or
                ``mlps`` when listed in ``config.disable_quant_module``.
            prefix: Parameter-name prefix for the sub-modules.
            alt_stream: Optional stream handed to both attention modules.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.layer_id = layer_id
        self.alt_stream = alt_stream
        # Two attention modules per decoder layer, each with its own layer id.
        self.self_attn = nn.ModuleList(
            [
                DeepseekV2AttentionMLA(
                    config=config,
                    hidden_size=config.hidden_size,
                    num_heads=config.num_attention_heads,
                    qk_nope_head_dim=config.qk_nope_head_dim,
                    qk_rope_head_dim=config.qk_rope_head_dim,
                    v_head_dim=config.v_head_dim,
                    q_lora_rank=config.q_lora_rank,
                    kv_lora_rank=config.kv_lora_rank,
                    rope_theta=config.rope_theta,
                    rope_scaling=None,
                    max_position_embeddings=config.max_position_embeddings,
                    quant_config=(
                        None
                        if "self_attn" in getattr(config, "disable_quant_module", [])
                        else quant_config
                    ),
                    layer_id=layer_id * 2 + i,
                    reduce_results=False,
                    prefix=add_prefix(f"self_attn.{i}", prefix),
                    alt_stream=self.alt_stream,
                )
                for i in range(2)
            ]
        )

        # One (input, post-attention) norm pair per attention module.
        self.input_layernorm = nn.ModuleList(
            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
        )
        self.post_attention_layernorm = nn.ModuleList(
            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
        )

        # Two dense MLPs; created with the default reduce_results=False, so
        # forward_mlp performs the all-reduce explicitly.
        self.mlps = nn.ModuleList(
            [
                LongcatFlashMLP(
                    hidden_size=config.hidden_size,
                    intermediate_size=config.intermediate_size,
                    hidden_act=config.hidden_act,
                    quant_config=(
                        None
                        if "mlps" in getattr(config, "disable_quant_module", [])
                        else quant_config
                    ),
                    prefix=add_prefix(f"mlps.{i}", prefix),
                )
                for i in range(2)
            ]
        )

        # The MoE block is stored under the attribute name `mlp`.
        self.mlp = LongcatFlashMoE(
            layer_id=self.layer_id,
            config=config,
            quant_config=quant_config,
            prefix=add_prefix("mlp", prefix),
        )

        self.attn_tp_size = get_attention_tp_size()
        self.attn_tp_rank = get_attention_tp_rank()

        # Scatter modes / communicators for the two dense sub-layers.
        # NOTE(review): only index 1 is used by the forward paths visible in
        # this class; index 0 is constructed but not referenced here.
        self.mlp_layer_scatter_modes = [
            LayerScatterModes.init_new(
                layer_id=self.layer_id * 2 + i,
                num_layers=config.num_hidden_layers,
                is_layer_sparse=False,
                is_previous_layer_sparse=False,
            )
            for i in range(2)
        ]
        self.mlp_layer_communicator = [
            LayerCommunicator(
                layer_scatter_modes=self.mlp_layer_scatter_modes[i],
                input_layernorm=self.input_layernorm[i],
                post_attention_layernorm=self.post_attention_layernorm[i],
            )
            for i in range(2)
        ]

        # Communicator for the first attention + MoE path; it shares the
        # first norm pair (index 0).
        self.moe_layer_scatter_modes = LayerScatterModes.init_new(
            layer_id=self.layer_id,
            num_layers=config.num_hidden_layers,
            is_layer_sparse=True,
            is_previous_layer_sparse=True,
        )
        self.moe_layer_communicator = LayerCommunicator(
            layer_scatter_modes=self.moe_layer_scatter_modes,
            input_layernorm=self.input_layernorm[0],
            post_attention_layernorm=self.post_attention_layernorm[0],
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
        residual: Optional[torch.Tensor],
        zero_allocator: BumpAllocator,
    ) -> torch.Tensor:
        """Run the layer and return ``(hidden_states, residual)``.

        Note that the value actually returned is a 2-tuple, although the
        annotation names a single tensor.
        """
        # first_attn
        hidden_states, residual = self.moe_layer_communicator.prepare_attn(
            hidden_states, residual, forward_batch
        )
        # Attention is skipped when there are no tokens.
        if hidden_states.shape[0] != 0:
            hidden_states = self.self_attn[0](
                positions=positions,
                hidden_states=hidden_states,
                forward_batch=forward_batch,
                zero_allocator=zero_allocator,
            )

        # moe
        hidden_states, residual = self.moe_layer_communicator.prepare_mlp(
            hidden_states, residual, forward_batch
        )
        # The MoE branch works on clones, so the dense branch below still
        # receives the untouched hidden_states / residual.
        moe_hidden_states = hidden_states.clone()
        moe_residual = residual.clone()
        moe_hidden_states = self.mlp(moe_hidden_states)
        # moe_residual is not read again after this call in this method.
        moe_hidden_states, moe_residual = self.moe_layer_communicator.postprocess_layer(
            moe_hidden_states, moe_residual, forward_batch
        )

        # Dense branch: first MLP -> second attention -> second MLP.
        hidden_states, residual = self.forward_mlp(
            hidden_states, positions, residual, forward_batch, zero_allocator
        )

        # Combine the MoE and dense branch outputs.
        hidden_states = moe_hidden_states + hidden_states
        return hidden_states, residual

    def forward_mlp(
        self, hidden_states, positions, residual, forward_batch, zero_allocator
    ):
        """Run the dense branch and return ``(hidden_states, residual)``."""
        # first_mlp
        hidden_states = self.mlps[0](hidden_states)
        # TP all_reduce
        hidden_states = tensor_model_parallel_all_reduce(hidden_states)

        # second_attn
        hidden_states, residual = self.mlp_layer_communicator[1].prepare_attn(
            hidden_states, residual, forward_batch
        )
        # Attention is skipped when there are no tokens.
        if hidden_states.shape[0] != 0:
            hidden_states = self.self_attn[1](
                positions=positions,
                hidden_states=hidden_states,
                forward_batch=forward_batch,
                zero_allocator=zero_allocator,
            )

        # second_mlp
        hidden_states, residual = self.mlp_layer_communicator[1].prepare_mlp(
            hidden_states, residual, forward_batch
        )
        hidden_states = self.mlps[1](hidden_states)
        # TP all_reduce
        hidden_states = tensor_model_parallel_all_reduce(hidden_states)

        hidden_states, residual = self.mlp_layer_communicator[1].postprocess_layer(
            hidden_states, residual, forward_batch
        )

        return hidden_states, residual
490
+
491
+
492
class LongcatFlashModel(nn.Module):
    """Token embedding, the stack of LongCat-Flash decoder layers and the final norm."""

    fall_back_to_pt_during_load = False

    def __init__(
        self,
        config: LongcatFlashConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Build the embedding, decoder layers and final RMSNorm.

        Args:
            config: Model config.
            quant_config: Quantization config forwarded to every decoder layer.
            prefix: Parameter-name prefix for the sub-modules.
        """
        super().__init__()
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            enable_tp=not is_dp_attention_enabled(),
        )

        # Only create the side stream when a CUDA/ROCm runtime is usable.
        # Constructing torch.cuda.Stream() unconditionally raises on hosts
        # without one; the decoder layers accept `alt_stream=None`.
        self.alt_stream = (
            torch.cuda.Stream() if torch.cuda.is_available() else None
        )
        self.layers = nn.ModuleList(
            [
                LongcatFlashDecoderLayer(
                    config,
                    layer_id,
                    quant_config=quant_config,
                    prefix=add_prefix(f"layers.{layer_id}", prefix),
                    alt_stream=self.alt_stream,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def get_input_embeddings(self) -> nn.Module:
        """Return the token-embedding module (not a tensor)."""
        return self.embed_tokens

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run all decoder layers and the final norm.

        Args:
            input_ids: Token ids; embedded when ``input_embeds`` is ``None``.
            positions: Position tensor forwarded to every layer.
            forward_batch: Batch metadata forwarded to every layer.
            input_embeds: Optional precomputed embeddings used instead of
                ``input_ids``.

        Returns:
            The final hidden states. The final norm is skipped when there
            are no tokens.
        """
        total_num_layers = len(self.layers)
        device = input_embeds.device if input_embeds is not None else input_ids.device
        # Scratch allocator shared by all layers; sized from the layer count
        # and doubled when the batch can run two-batch overlap.
        zero_allocator = BumpAllocator(
            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
            dtype=torch.float32,
            device=device,
        )
        if input_embeds is None:
            hidden_states = self.embed_tokens(input_ids)
        else:
            hidden_states = input_embeds

        residual = None

        for i in range(total_num_layers):
            # Tag expert-distribution records with the current layer index.
            with get_global_expert_distribution_recorder().with_current_layer(i):
                layer = self.layers[i]
                hidden_states, residual = layer(
                    positions, hidden_states, forward_batch, residual, zero_allocator
                )

        if hidden_states.shape[0] != 0:
            if residual is None:
                hidden_states = self.norm(hidden_states)
            else:
                hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
562
+
563
+
564
+ class LongcatFlashForCausalLM(nn.Module):
565
+ # for quark model load
566
+ packed_modules_mapping = {}
567
+
568
+ def __init__(
569
+ self,
570
+ config: LongcatFlashConfig,
571
+ quant_config: Optional[QuantizationConfig] = None,
572
+ prefix: str = "",
573
+ ) -> None:
574
+ super().__init__()
575
+
576
+ # for quark model load
577
+ # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
578
+ self.fuse_qkv_a_proj = (
579
+ hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
580
+ )
581
+ if self.fuse_qkv_a_proj:
582
+ self.packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
583
+ "q_a_proj",
584
+ "kv_a_proj_with_mqa",
585
+ ]
586
+
587
+ self.config = config
588
+ self.tp_size = get_tensor_model_parallel_world_size()
589
+ self.quant_config = quant_config
590
+ self.model = LongcatFlashModel(
591
+ config, quant_config, prefix=add_prefix("model", prefix)
592
+ )
593
+ self.lm_head = ParallelLMHead(
594
+ config.vocab_size,
595
+ config.hidden_size,
596
+ quant_config=quant_config,
597
+ prefix=add_prefix("lm_head", prefix),
598
+ use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
599
+ )
600
+ self.logits_processor = LogitsProcessor(config)
601
+
602
+ def get_input_embeddings(self) -> nn.Embedding:
603
+ return self.model.embed_tokens
604
+
605
+ @torch.no_grad()
606
+ def forward(
607
+ self,
608
+ input_ids: torch.Tensor,
609
+ positions: torch.Tensor,
610
+ forward_batch: ForwardBatch,
611
+ input_embeds: torch.Tensor = None,
612
+ ) -> torch.Tensor:
613
+ hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
614
+
615
+ return self.logits_processor(
616
+ input_ids, hidden_states, self.lm_head, forward_batch
617
+ )
618
+
619
    def post_load_weights(self, weight_names=None):
        """Derive per-attention ``w_kc`` / ``w_vc`` (and scales) from ``kv_b_proj`` after loading.

        Args:
            weight_names: Optional iterable of loaded weight names. When
                ``None`` every layer is processed; otherwise only layers
                whose ``kv_b_proj`` appears in the names are processed.
        """

        # Perform post-processing after loading weights
        if weight_names is None:
            layer_ids = range(self.config.num_hidden_layers)
        else:
            layer_ids = set()
            for name in weight_names:
                if "kv_b_proj" in name:
                    # The layer index is the third dot-separated component.
                    layer_id = int(name.split(".")[2])
                    if layer_id < self.config.num_hidden_layers:
                        layer_ids.add(layer_id)

        for layer_id in layer_ids:
            # Each decoder layer has two attention modules.
            for i in range(2):
                self_attn = self.model.layers[layer_id].self_attn[i]
                if hasattr(self_attn.kv_b_proj, "qweight"):
                    # AWQ compatible
                    if _is_cuda or _is_hip:
                        w = awq_dequantize(
                            self_attn.kv_b_proj.qweight,
                            self_attn.kv_b_proj.scales,
                            self_attn.kv_b_proj.qzeros,
                        ).T
                    else:
                        # The fallback implementation takes three extra
                        # positional arguments, all passed as 0.
                        w = awq_dequantize(
                            self_attn.kv_b_proj.qweight,
                            self_attn.kv_b_proj.scales,
                            self_attn.kv_b_proj.qzeros,
                            0,
                            0,
                            0,
                        ).T
                else:
                    w = self_attn.kv_b_proj.weight
                # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
                # This may affect the accuracy of fp8 model.
                # Fix deepseek v3 blockwise bmm by using deep_gemm
                use_deep_gemm_bmm = False

                # FP8 weights: block-quantized or per-channel.
                if w.dtype in (
                    torch.float8_e4m3fn,
                    torch.float8_e4m3fnuz,
                ):
                    if (
                        hasattr(self.quant_config, "weight_block_size")
                        and self.quant_config.weight_block_size is not None
                    ):
                        weight_block_size = self.quant_config.weight_block_size
                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                        if _is_fp8_fnuz:
                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                                weight=w,
                                weight_scale=self_attn.kv_b_proj.weight_scale_inv,
                                input_scale=None,
                            )
                        else:
                            weight = w
                            weight_scale = self_attn.kv_b_proj.weight_scale_inv

                        if (
                            _is_cuda
                            and weight_block_size[0] == 128
                            and weight_block_size[1] == 128
                        ):
                            if (
                                deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
                                and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
                                and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
                            ):
                                # Keep the block scales; they are split into
                                # k / v parts further below.
                                block_scale = weight_scale
                                use_deep_gemm_bmm = True
                            else:
                                # Otherwise dequantize the block-quantized
                                # weight to bfloat16.
                                w = block_quant_dequant(
                                    weight,
                                    weight_scale,
                                    weight_block_size,
                                    torch.bfloat16,
                                )
                        else:
                            # Convert block quantization to one per-tensor scale.
                            w, scale = block_quant_to_tensor_quant(
                                weight, weight_scale, weight_block_size
                            )
                            self_attn.w_scale = scale
                    else:
                        if _is_fp8_fnuz:
                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                                weight=w,
                                weight_scale=self_attn.kv_b_proj.weight_scale,
                                input_scale=None,
                            )
                        else:
                            weight = w
                            weight_scale = self_attn.kv_b_proj.weight_scale

                        # Convert per-channel quantization to one per-tensor scale.
                        w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
                        self_attn.w_scale = scale

                # INT8 weights are dequantized to bfloat16.
                if w.dtype == torch.int8:
                    if hasattr(self.quant_config, "weight_block_size"):
                        # block-wise int8 need it
                        weight_block_size = self.quant_config.weight_block_size
                        if weight_block_size is not None:
                            assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                            weight = w
                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
                            w = int8_block_dequant(
                                weight, weight_scale, weight_block_size
                            ).to(torch.bfloat16)
                    else:
                        # channel-wise int8 need it
                        w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
                            torch.bfloat16
                        )

                # Group dim 0 into chunks of (qk_nope_head_dim + v_head_dim)
                # rows, then split each chunk into its k part and v part.
                w_kc, w_vc = w.unflatten(
                    0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
                ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
                if not use_deep_gemm_bmm:
                    self_attn.w_kc = bind_or_assign(
                        self_attn.w_kc,
                        w_kc.transpose(1, 2).contiguous().transpose(1, 2),
                    )
                    self_attn.w_vc = bind_or_assign(
                        self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
                    )
                    # Fall back to kv_b_proj's own scale when none was set above.
                    if (
                        hasattr(self_attn.kv_b_proj, "weight_scale")
                        and self_attn.w_scale is None
                    ):
                        self_attn.w_scale = bind_or_assign(
                            self_attn.w_scale, self_attn.kv_b_proj.weight_scale
                        )
                        if _is_hip:
                            self_attn.w_scale *= 2.0
                    # TODO: remove this after adding FP8 support in bmm cpu kernel
                    if (
                        _is_cpu
                        and _is_cpu_amx_available
                        and w.dtype == torch.float8_e4m3fn
                    ):
                        self_attn.w_kc = (
                            self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
                        )
                        self_attn.w_vc = (
                            self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
                        )
                else:
                    # deep_gemm path: split the block scales the same way as
                    # the weights. weight_block_size and block_scale were set
                    # in the FP8 block-quant branch above.
                    num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
                    num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
                    ws_kc, ws_vc = block_scale.unflatten(
                        0, (-1, (num_tiles_k + num_tiles_n))
                    ).split([num_tiles_k, num_tiles_n], dim=1)
                    self_attn.w_scale_k = bind_or_assign(
                        self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
                    )
                    self_attn.w_scale_v = bind_or_assign(
                        self_attn.w_scale_v, ws_vc.contiguous()
                    )
                    self_attn.w_kc = bind_or_assign(
                        self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
                    )
                    self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
                    self_attn.use_deep_gemm_bmm = True

                # Optional in-place rescaling of the LoRA layernorm weights by
                # sqrt(hidden_size / lora_rank).
                if self.config.mla_scale_q_lora:
                    self_attn.q_a_layernorm.weight.data *= (
                        self.config.hidden_size / self.config.q_lora_rank
                    ) ** 0.5
                if self.config.mla_scale_kv_lora:
                    self_attn.kv_a_layernorm.weight.data *= (
                        self.config.hidden_size / self.config.kv_lora_rank
                    ) ** 0.5

        # Requantize block-quantized weights when deep_gemm uses UE8M0 scales.
        if (
            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
            and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
            and hasattr(self.quant_config, "weight_block_size")
            and self.quant_config.weight_block_size is not None
        ):
            self._weight_requant_ue8m0()
800
+
801
def _weight_requant_ue8m0(self):
    """Requantize the block-quantized weights of every decoder layer in place.

    For each of the ``config.num_hidden_layers`` layers, the following
    weights are passed to ``requant_weight_ue8m0_inplace`` together with
    their scale tensor and ``quant_config.weight_block_size``:

    * the four projection modules of both attention blocks
      (``self_attn[0]`` and ``self_attn[1]``),
    * ``gate_up_proj`` / ``down_proj`` of both dense MLPs
      (``mlps[0]`` and ``mlps[1]``),
    * the ``w13`` / ``w2`` FP8 expert weights, when the layer's experts
      are a ``DeepEPMoE``.
    """
    weight_block_size = self.quant_config.weight_block_size

    for layer_id in range(self.config.num_hidden_layers):
        layer = self.model.layers[layer_id]

        for i in range(2):
            attn = layer.self_attn[i]
            for module in (
                attn.fused_qkv_a_proj_with_mqa,
                attn.q_b_proj,
                attn.kv_b_proj,
                attn.o_proj,
            ):
                requant_weight_ue8m0_inplace(
                    module.weight, module.weight_scale_inv, weight_block_size
                )

            mlp = layer.mlps[i]
            assert isinstance(mlp, LongcatFlashMLP)
            for module in (mlp.gate_up_proj, mlp.down_proj):
                requant_weight_ue8m0_inplace(
                    module.weight, module.weight_scale_inv, weight_block_size
                )

        # The experts must be looked up on the layer of *this* iteration.
        # Handling them in a separate loop that does not rebind ``layer``
        # would requantize only the last layer's experts, repeatedly.
        experts = layer.mlp.experts
        if isinstance(experts, DeepEPMoE):
            for w in (experts.w13_weight_fp8, experts.w2_weight_fp8):
                # Each entry is indexed as (weight, scale).
                requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
834
+
835
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Load checkpoint tensors into the model's parameters.

    Each ``(name, tensor)`` pair from ``weights`` is routed, in order of
    precedence, to one of:

    1. a stacked dense-MLP parameter (``gate_proj``/``up_proj`` ->
       ``gate_up_proj``),
    2. an expert parameter, using the mapping produced by the MoE
       implementation class,
    3. the fused ``fused_qkv_a_proj_with_mqa`` parameter (only when
       ``config.q_lora_rank`` is set), once both halves have been seen,
    4. the parameter with the same (possibly kv-scale-remapped) name.

    Names containing ``"mtp"`` are ignored entirely; names containing
    ``"rotary_emb.inv_freq"`` are recorded but not loaded. The actual
    loader calls are submitted to a thread pool, and any exception they
    raise is re-raised here. Finally ``post_load_weights`` is called with
    the list of recorded checkpoint names.
    """
    # (fused parameter name, checkpoint shard name, shard index)
    stacked_params_mapping = [
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    # Entries are (param_name, weight_name, expert_id, shard_id).
    expert_params_mapping = get_moe_impl_class().make_expert_params_mapping(
        ckpt_gate_proj_name="gate_proj",
        ckpt_down_proj_name="down_proj",
        ckpt_up_proj_name="up_proj",
        num_experts=self.config.n_routed_experts,
    )

    # q_a_proj and kv_a_proj_with_mqa are fused along the output dimension
    # whenever the config defines a non-None q_lora_rank.
    fuse_qkv_a_proj = getattr(self.config, "q_lora_rank", None) is not None
    cached_a_proj = {} if fuse_qkv_a_proj else None

    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = []
        params_dict = dict(self.named_parameters())
        weight_names = []

        for name, loaded_weight in weights:
            if "mtp" in name:
                continue
            weight_names.append(name)
            if "rotary_emb.inv_freq" in name:
                continue

            # --- 1. stacked dense-MLP shards ---------------------------
            dispatched = False
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # Expert shards (e.g. mlp.experts[0].gate_proj) are handled
                # by the expert mapping below; skip them *before* renaming
                # so the name is not rewritten twice.
                if "mlp.experts." in name and name not in params_dict:
                    continue
                name = name.replace(weight_name, param_name)
                # GPTQ checkpoints may carry biases the model does not have.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                target = params_dict[name]
                pending.append(
                    executor.submit(
                        target.weight_loader, target, loaded_weight, shard_id
                    )
                )
                dispatched = True
                break
            if dispatched:
                continue

            # --- 2. expert parameters ----------------------------------
            for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                target = params_dict[name]
                pending.append(
                    executor.submit(
                        target.weight_loader,
                        target,
                        loaded_weight,
                        name,
                        shard_id=shard_id,
                        expert_id=expert_id,
                    )
                )
                dispatched = True
                break
            if dispatched:
                continue

            # GPTQ checkpoints may carry biases the model does not have.
            if name.endswith(".bias") and name not in params_dict:
                continue

            # --- 3. fused q_a / kv_a projection ------------------------
            if fuse_qkv_a_proj and (
                "q_a_proj" in name or "kv_a_proj_with_mqa" in name
            ):
                cached_a_proj[name] = loaded_weight
                is_q_half = "q_a_proj" in name
                is_kv_half = "kv_a_proj_with_mqa" in name
                if is_q_half:
                    q_a_proj_name = name
                else:
                    q_a_proj_name = name.replace("kv_a_proj_with_mqa", "q_a_proj")
                if is_kv_half:
                    kv_a_proj_name = name
                else:
                    kv_a_proj_name = name.replace("q_a_proj", "kv_a_proj_with_mqa")

                # Wait until both halves are cached before fusing.
                if (
                    q_a_proj_name not in cached_a_proj
                    or kv_a_proj_name not in cached_a_proj
                ):
                    continue

                q_a_proj_weight = cached_a_proj[q_a_proj_name]
                kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
                if self.quant_config is not None and self.quant_config.get_name() in (
                    "awq",
                    "awq_marlin",
                    "moe_wna16",
                ):
                    cat_dim = 1
                else:
                    cat_dim = 0
                fused_weight = torch.cat(
                    [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
                )

                if is_q_half:
                    fused_name = name.replace(
                        "q_a_proj", "fused_qkv_a_proj_with_mqa"
                    )
                else:
                    fused_name = name.replace(
                        "kv_a_proj_with_mqa", "fused_qkv_a_proj_with_mqa"
                    )
                target = params_dict[fused_name]
                loader = getattr(target, "weight_loader", default_weight_loader)
                pending.append(executor.submit(loader, target, fused_weight))
                cached_a_proj.pop(q_a_proj_name)
                cached_a_proj.pop(kv_a_proj_name)
                continue

            # --- 4. plain parameters -----------------------------------
            if ("k_scale" in name or "v_scale" in name) and name not in params_dict:
                # modelopt names the attention kv scales differently.
                scale = "k_scale" if "k_scale" in name else "v_scale"
                name = name.replace(f"{scale[0]}_proj", "attn_mqa")
            if name not in params_dict:
                # modelopt checkpoints contain weights that are not needed
                # here (MTP module k_scale / v_scale).
                logger.warning(f"{name} not found in params_dict.")
                continue
            target = params_dict[name]
            loader = getattr(target, "weight_loader", default_weight_loader)
            pending.append(executor.submit(loader, target, loaded_weight))

        # Block until every submitted load finished; result() re-raises
        # any exception raised inside a worker.
        for done in concurrent.futures.as_completed(pending):
            done.result()

    self.post_load_weights(weight_names=weight_names)
995
+
996
def get_embed_and_head(self):
    """Return ``(embed_tokens.weight, lm_head.weight)`` of this model."""
    embed_weight = self.model.embed_tokens.weight
    head_weight = self.lm_head.weight
    return embed_weight, head_weight
998
+
999
def set_embed_and_head(self, embed, head):
    """Replace the embedding and LM-head weights with ``embed`` and ``head``.

    The existing ``weight`` attributes are deleted before the new ones are
    assigned, so the old tensors are no longer referenced from this model.
    When a CUDA runtime is available, the CUDA caching allocator is then
    emptied and the device synchronized.

    Args:
        embed: New value for ``self.model.embed_tokens.weight``.
        head: New value for ``self.lm_head.weight``.
    """
    del self.model.embed_tokens.weight
    del self.lm_head.weight
    self.model.embed_tokens.weight = embed
    self.lm_head.weight = head
    # torch.cuda.synchronize() raises on hosts without a usable CUDA
    # runtime (e.g. CPU-only execution), so only touch CUDA when present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
1006
+
1007
@classmethod
def get_model_config_for_expert_location(cls, config):
    """Build a ``ModelConfigForExpertLocation`` from ``config``.

    ``num_layers`` is taken from ``config.num_hidden_layers`` and
    ``num_logical_experts`` from ``config.n_routed_experts``.
    """
    layer_count = config.num_hidden_layers
    routed_expert_count = config.n_routed_experts
    return ModelConfigForExpertLocation(
        num_layers=layer_count,
        num_logical_experts=routed_expert_count,
    )
1013
+
1014
+
1015
# Model class(es) exported by this module.
# NOTE(review): presumably this list is what the model registry/loader looks
# up to discover the implementation — confirm against the loader code.
EntryClass = [LongcatFlashForCausalLM]