sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +2 -0
  3. sglang/srt/configs/longcat_flash.py +104 -0
  4. sglang/srt/configs/model_config.py +14 -1
  5. sglang/srt/connector/__init__.py +1 -1
  6. sglang/srt/connector/base_connector.py +1 -2
  7. sglang/srt/connector/redis.py +2 -2
  8. sglang/srt/connector/serde/__init__.py +1 -1
  9. sglang/srt/connector/serde/safe_serde.py +4 -3
  10. sglang/srt/disaggregation/ascend/conn.py +75 -0
  11. sglang/srt/disaggregation/launch_lb.py +0 -13
  12. sglang/srt/disaggregation/mini_lb.py +33 -8
  13. sglang/srt/disaggregation/prefill.py +1 -1
  14. sglang/srt/distributed/parallel_state.py +27 -15
  15. sglang/srt/entrypoints/engine.py +19 -12
  16. sglang/srt/entrypoints/http_server.py +174 -34
  17. sglang/srt/entrypoints/openai/protocol.py +60 -0
  18. sglang/srt/eplb/eplb_manager.py +26 -2
  19. sglang/srt/eplb/expert_distribution.py +29 -2
  20. sglang/srt/hf_transformers_utils.py +10 -0
  21. sglang/srt/layers/activation.py +12 -0
  22. sglang/srt/layers/attention/ascend_backend.py +240 -109
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
  24. sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
  25. sglang/srt/layers/layernorm.py +28 -3
  26. sglang/srt/layers/linear.py +3 -2
  27. sglang/srt/layers/logits_processor.py +1 -1
  28. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  29. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  30. sglang/srt/layers/moe/ep_moe/layer.py +14 -13
  31. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  32. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  34. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  37. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  38. sglang/srt/layers/moe/topk.py +35 -12
  39. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  40. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  41. sglang/srt/layers/quantization/modelopt_quant.py +7 -0
  42. sglang/srt/layers/quantization/mxfp4.py +9 -4
  43. sglang/srt/layers/quantization/utils.py +13 -0
  44. sglang/srt/layers/quantization/w4afp8.py +30 -25
  45. sglang/srt/layers/quantization/w8a8_int8.py +7 -3
  46. sglang/srt/layers/rotary_embedding.py +28 -1
  47. sglang/srt/layers/sampler.py +29 -5
  48. sglang/srt/managers/cache_controller.py +62 -96
  49. sglang/srt/managers/detokenizer_manager.py +9 -2
  50. sglang/srt/managers/io_struct.py +27 -0
  51. sglang/srt/managers/mm_utils.py +5 -1
  52. sglang/srt/managers/multi_tokenizer_mixin.py +629 -0
  53. sglang/srt/managers/scheduler.py +39 -2
  54. sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
  55. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  56. sglang/srt/managers/tokenizer_manager.py +86 -39
  57. sglang/srt/mem_cache/chunk_cache.py +1 -1
  58. sglang/srt/mem_cache/hicache_storage.py +20 -3
  59. sglang/srt/mem_cache/hiradix_cache.py +94 -71
  60. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  61. sglang/srt/mem_cache/memory_pool.py +4 -0
  62. sglang/srt/mem_cache/memory_pool_host.py +4 -4
  63. sglang/srt/mem_cache/radix_cache.py +5 -4
  64. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  65. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  66. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -9
  67. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
  68. sglang/srt/mem_cache/swa_radix_cache.py +1 -1
  69. sglang/srt/model_executor/model_runner.py +5 -4
  70. sglang/srt/model_loader/loader.py +15 -24
  71. sglang/srt/model_loader/utils.py +12 -0
  72. sglang/srt/models/deepseek_v2.py +31 -10
  73. sglang/srt/models/gpt_oss.py +5 -18
  74. sglang/srt/models/llama_eagle3.py +4 -0
  75. sglang/srt/models/longcat_flash.py +1026 -0
  76. sglang/srt/models/longcat_flash_nextn.py +699 -0
  77. sglang/srt/models/qwen2.py +26 -3
  78. sglang/srt/models/qwen2_5_vl.py +65 -41
  79. sglang/srt/models/qwen2_moe.py +22 -2
  80. sglang/srt/models/transformers.py +1 -1
  81. sglang/srt/multimodal/processors/base_processor.py +4 -2
  82. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  83. sglang/srt/server_args.py +112 -55
  84. sglang/srt/speculative/eagle_worker.py +28 -8
  85. sglang/srt/utils.py +4 -0
  86. sglang/test/attention/test_trtllm_mla_backend.py +12 -3
  87. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  88. sglang/version.py +1 -1
  89. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +5 -5
  90. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +93 -85
  91. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
  92. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
  93. {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1026 @@
1
+ # Apache License, Version 2.0:
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ #
14
+ # MIT License:
15
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ # of this software and associated documentation files (the "Software"), to deal
17
+ # in the Software without restriction, including without limitation the rights
18
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ # copies of the Software, and to permit persons to whom the Software is
20
+ # furnished to do so, subject to the following conditions:
21
+ #
22
+ # The above copyright notice and this permission notice shall be included in all
23
+ # copies or substantial portions of the Software.
24
+ #
25
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ # SOFTWARE.
32
+
33
+ import concurrent.futures
34
+ import logging
35
+ import os
36
+ from enum import IntEnum, auto
37
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
38
+
39
+ import torch
40
+ import torch.nn.functional as F
41
+ from torch import nn
42
+ from tqdm import tqdm
43
+
44
+ from sglang.srt.configs import LongcatFlashConfig
45
+ from sglang.srt.distributed import (
46
+ get_tensor_model_parallel_world_size,
47
+ tensor_model_parallel_all_reduce,
48
+ )
49
+ from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
50
+ from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
51
+ from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
52
+ from sglang.srt.layers.activation import SiluAndMul
53
+ from sglang.srt.layers.amx_utils import PackWeightMethod
54
+ from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
55
+ from sglang.srt.layers.dp_attention import (
56
+ get_attention_tp_rank,
57
+ get_attention_tp_size,
58
+ is_dp_attention_enabled,
59
+ )
60
+ from sglang.srt.layers.layernorm import RMSNorm
61
+ from sglang.srt.layers.linear import (
62
+ MergedColumnParallelLinear,
63
+ ReplicatedLinear,
64
+ RowParallelLinear,
65
+ )
66
+ from sglang.srt.layers.logits_processor import LogitsProcessor
67
+ from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton
68
+ from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
69
+ from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
70
+ from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
71
+ from sglang.srt.layers.quantization import deep_gemm_wrapper
72
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
73
+ from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
74
+ from sglang.srt.layers.quantization.fp8_utils import (
75
+ block_quant_dequant,
76
+ block_quant_to_tensor_quant,
77
+ channel_quant_to_tensor_quant,
78
+ normalize_e4m3fn_to_e4m3fnuz,
79
+ requant_weight_ue8m0_inplace,
80
+ )
81
+ from sglang.srt.layers.quantization.int8_utils import (
82
+ block_dequant as int8_block_dequant,
83
+ )
84
+ from sglang.srt.layers.vocab_parallel_embedding import (
85
+ ParallelLMHead,
86
+ VocabParallelEmbedding,
87
+ )
88
+ from sglang.srt.managers.schedule_batch import global_server_args_dict
89
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
90
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
91
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
92
+ from sglang.srt.utils import (
93
+ BumpAllocator,
94
+ LazyValue,
95
+ add_prefix,
96
+ bind_or_assign,
97
+ cpu_has_amx_support,
98
+ get_bool_env_var,
99
+ get_device_sm,
100
+ get_int_env_var,
101
+ is_cpu,
102
+ is_cuda,
103
+ is_flashinfer_available,
104
+ is_hip,
105
+ is_non_idle_and_non_empty,
106
+ is_npu,
107
+ is_sm100_supported,
108
+ )
109
+
110
# Platform / capability flags, evaluated once when this module is imported.
_is_hip = is_hip()
_is_cuda = is_cuda()
_is_npu = is_npu()
_is_fp8_fnuz = is_fp8_fnuz()
# True only when the SGLANG_USE_AITER env var is set AND the platform is HIP.
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()
_device_sm = get_device_sm()

# Bind `awq_dequantize` (used by post_load_weights for AWQ checkpoints) to a
# platform-specific implementation.
if _is_cuda:
    from sgl_kernel import (
        awq_dequantize,
        bmm_fp8,
        dsv3_fused_a_gemm,
        dsv3_router_gemm,
        merge_state_v2,
    )
elif _is_cpu and _is_cpu_amx_available:
    # Nothing is imported on this path, so `awq_dequantize` is not defined by
    # this chain when running on CPU with AMX.
    pass
elif _is_hip:
    from sglang.srt.layers.quantization.awq_triton import (
        awq_dequantize_triton as awq_dequantize,
    )
else:
    # Every remaining platform falls back to the vllm implementation.
    from vllm._custom_ops import awq_dequantize

logger = logging.getLogger(__name__)
137
+
138
+
139
class LongcatFlashMLP(nn.Module):
    """Feed-forward block: merged gate/up projection -> SiluAndMul -> down projection.

    Only ``hidden_act == "silu"`` is accepted. ``reduce_results`` is forwarded
    to the down projection and defaults to ``False``.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        reduce_results: bool = False,
        prefix: str = "",
    ) -> None:
        super().__init__()

        # Both projections are bias-free and share the same quantization config.
        common_kwargs = dict(bias=False, quant_config=quant_config)

        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size, intermediate_size],
            prefix=add_prefix("gate_up_proj", prefix),
            **common_kwargs,
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            reduce_results=reduce_results,
            prefix=add_prefix("down_proj", prefix),
            **common_kwargs,
        )

        # The activation is validated only after the projections exist.
        if hidden_act != "silu":
            raise ValueError(
                f"Unsupported activation: {hidden_act}. "
                "Only silu is supported for now."
            )
        self.act_fn = SiluAndMul()

    def forward(
        self,
        x,
    ):
        """Apply the block to ``x`` and return the down-projected result."""
        fused_gate_up, _ = self.gate_up_proj(x)
        activated = self.act_fn(fused_gate_up)
        projected, _ = self.down_proj(activated)
        return projected
180
+
181
+
182
class LongcatFlashRouter(nn.Module):
    """Replicated linear classifier that scores every routed and zero expert.

    The module also owns ``e_score_correction_bias``, a zero-initialised
    parameter with one entry per expert slot.
    """

    def __init__(
        self,
        config,
        zero_expert_num=0,
        rounter_params_dtype=torch.float32,
        prefix: str = "",
    ):
        super().__init__()
        # Expert slots = real routed experts plus the extra zero experts.
        self.n_routed_experts = config.n_routed_experts + zero_expert_num
        self.rounter_params_dtype = rounter_params_dtype

        self.classifier = ReplicatedLinear(
            config.hidden_size,
            self.n_routed_experts,
            bias=config.router_bias,
            params_dtype=rounter_params_dtype,
            quant_config=None,
            prefix=add_prefix("classifier", prefix),
        )

        bias_init = torch.zeros(self.n_routed_experts, dtype=rounter_params_dtype)
        self.e_score_correction_bias = nn.Parameter(bias_init)

    def forward(self, hidden_states):
        """Return router logits, computing in ``rounter_params_dtype``."""
        router_input = hidden_states.to(self.rounter_params_dtype)
        logits, _ = self.classifier(router_input)
        return logits
209
+
210
+
211
class LongcatFlashMoE(nn.Module):
    """Mixture-of-experts block with optional "zero experts".

    A ``LongcatFlashRouter`` scores routed + zero expert slots, ``TopK``
    (native implementation) selects ``config.moe_topk`` of them, and the
    selected experts are evaluated by the class returned from
    ``get_moe_impl_class()``.
    """

    def __init__(
        self,
        config: LongcatFlashConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        self.layer_id = layer_id
        self.routed_scaling_factor = config.routed_scaling_factor
        self.num_experts = config.n_routed_experts
        self.top_k = config.moe_topk
        self.zero_expert_num = config.zero_expert_num
        self.zero_expert_type = config.zero_expert_type

        # Any value other than the string "float32" selects bfloat16.
        self.rounter_params_dtype = (
            torch.float32
            if config.rounter_params_dtype == "float32"
            else torch.bfloat16
        )

        self.tp_size = get_tensor_model_parallel_world_size()

        if self.tp_size > config.n_routed_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.n_routed_experts}."
            )

        if config.hidden_act != "silu":
            raise ValueError(
                f"Unsupported activation: {config.hidden_act}. "
                "Only silu is supported for now."
            )

        self.router = LongcatFlashRouter(
            config=self.config,
            zero_expert_num=self.zero_expert_num,
            rounter_params_dtype=self.rounter_params_dtype,
            prefix=add_prefix("router", prefix),
        )

        self.topk = TopK(
            top_k=self.top_k,
            renormalize=False,
            use_grouped_topk=False,
            correction_bias=self.router.e_score_correction_bias.data,
        )
        # Always dispatch to the native top-k implementation.
        self.topk.forward = self.topk.forward_native

        moe_cls = get_moe_impl_class()
        self.experts = moe_cls(
            num_experts=self.num_experts,
            top_k=self.top_k,
            layer_id=self.layer_id,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            quant_config=quant_config,
            prefix=add_prefix("experts", prefix),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Route tokens, run the experts and return a (num_tokens, hidden_dim) tensor."""
        num_tokens, hidden_dim = hidden_states.shape
        tokens = hidden_states.view(-1, hidden_dim)

        # router_logits: (num_tokens, n_experts)
        router_logits = self.router(tokens)
        topk_weights, topk_idx, topk_extra = self.topk(tokens, router_logits)

        has_zero_experts = self.zero_expert_type is not None
        if has_zero_experts:
            # Computed before the expert call, from the same top-k tensors.
            zero_expert_result = zero_experts_compute_triton(
                expert_indices=topk_idx,
                expert_scales=topk_weights,
                num_experts=self.num_experts,
                zero_expert_type=self.zero_expert_type,
                hidden_states=tokens,
            )
        topk_output = StandardTopKOutput(topk_weights, topk_idx, topk_extra)

        expert_out = self.experts(tokens, topk_output)
        expert_out *= self.routed_scaling_factor

        # NOTE(review): the zero-expert term is added before the all-reduce
        # below; confirm it is not over-counted across TP ranks.
        if has_zero_experts and tokens.shape[0] > 0:
            expert_out += zero_expert_result.to(expert_out.device)

        if self.tp_size > 1:
            expert_out = tensor_model_parallel_all_reduce(expert_out)

        return expert_out.view(num_tokens, hidden_dim)

    def get_moe_weights(self):
        """Return the data of every expert parameter except ``correction_bias``."""
        weights = []
        for param_name, param in self.experts.named_parameters():
            if param_name != "correction_bias":
                weights.append(param.data)
        return weights
310
+
311
+
312
class LongcatFlashDecoderLayer(nn.Module):
    """One LongCat-Flash decoder layer.

    Each layer owns two MLA attention modules, two dense MLPs (``mlps``) and
    one MoE block (``mlp``). ``forward`` runs the first attention, then feeds
    its output both to the MoE block (on cloned tensors) and to the dense
    path (first MLP -> second attention -> second MLP), and sums the two
    results.
    """

    def __init__(
        self,
        config: LongcatFlashConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        alt_stream: Optional[torch.cuda.Stream] = None,
    ) -> None:
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.layer_id = layer_id
        self.alt_stream = alt_stream
        # Two attention modules per decoder layer; their attention layer ids
        # are 2 * layer_id and 2 * layer_id + 1.
        self.self_attn = nn.ModuleList(
            [
                DeepseekV2AttentionMLA(
                    config=config,
                    hidden_size=config.hidden_size,
                    num_heads=config.num_attention_heads,
                    qk_nope_head_dim=config.qk_nope_head_dim,
                    qk_rope_head_dim=config.qk_rope_head_dim,
                    v_head_dim=config.v_head_dim,
                    q_lora_rank=config.q_lora_rank,
                    kv_lora_rank=config.kv_lora_rank,
                    rope_theta=config.rope_theta,
                    rope_scaling=None,
                    max_position_embeddings=config.max_position_embeddings,
                    # Quantization is skipped for attention when the config
                    # lists "self_attn" in `disable_quant_module`.
                    quant_config=(
                        None
                        if "self_attn" in getattr(config, "disable_quant_module", [])
                        else quant_config
                    ),
                    layer_id=layer_id * 2 + i,
                    reduce_results=False,
                    prefix=add_prefix(f"self_attn.{i}", prefix),
                    alt_stream=self.alt_stream,
                )
                for i in range(2)
            ]
        )

        # One input / post-attention norm pair per attention module.
        self.input_layernorm = nn.ModuleList(
            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
        )
        self.post_attention_layernorm = nn.ModuleList(
            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
        )

        # Dense MLPs; built with the default reduce_results=False, so callers
        # all-reduce their output explicitly (see forward_mlp).
        self.mlps = nn.ModuleList(
            [
                LongcatFlashMLP(
                    hidden_size=config.hidden_size,
                    intermediate_size=config.intermediate_size,
                    hidden_act=config.hidden_act,
                    # Same opt-out mechanism as attention, keyed on "mlps".
                    quant_config=(
                        None
                        if "mlps" in getattr(config, "disable_quant_module", [])
                        else quant_config
                    ),
                    prefix=add_prefix(f"mlps.{i}", prefix),
                )
                for i in range(2)
            ]
        )

        # The MoE block always receives the unmodified quant_config.
        self.mlp = LongcatFlashMoE(
            layer_id=self.layer_id,
            config=config,
            quant_config=quant_config,
            prefix=add_prefix("mlp", prefix),
        )

        self.attn_tp_size = get_attention_tp_size()
        self.attn_tp_rank = get_attention_tp_rank()

        # Scatter modes / communicators for the two dense sub-layers (declared
        # as non-sparse). Only index 1 is used by the methods of this class.
        self.mlp_layer_scatter_modes = [
            LayerScatterModes.init_new(
                layer_id=self.layer_id * 2 + i,
                num_layers=config.num_hidden_layers,
                is_layer_sparse=False,
                is_previous_layer_sparse=False,
            )
            for i in range(2)
        ]
        self.mlp_layer_communicator = [
            LayerCommunicator(
                layer_scatter_modes=self.mlp_layer_scatter_modes[i],
                input_layernorm=self.input_layernorm[i],
                post_attention_layernorm=self.post_attention_layernorm[i],
            )
            for i in range(2)
        ]

        # Communicator for the MoE path (declared as sparse); it shares the
        # first norm pair with mlp_layer_communicator[0].
        self.moe_layer_scatter_modes = LayerScatterModes.init_new(
            layer_id=self.layer_id,
            num_layers=config.num_hidden_layers,
            is_layer_sparse=True,
            is_previous_layer_sparse=True,
        )
        self.moe_layer_communicator = LayerCommunicator(
            layer_scatter_modes=self.moe_layer_scatter_modes,
            input_layernorm=self.input_layernorm[0],
            post_attention_layernorm=self.post_attention_layernorm[0],
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
        residual: Optional[torch.Tensor],
        zero_allocator: BumpAllocator,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the layer and return ``(hidden_states, residual)``.

        The returned hidden states are the sum of the MoE output and the
        dense-path output; the returned residual comes from the dense path.
        """
        # first_attn
        hidden_states, residual = self.moe_layer_communicator.prepare_attn(
            hidden_states, residual, forward_batch
        )
        # Attention is skipped entirely when there are no tokens.
        if hidden_states.shape[0] != 0:
            hidden_states = self.self_attn[0](
                positions=positions,
                hidden_states=hidden_states,
                forward_batch=forward_batch,
                zero_allocator=zero_allocator,
            )

        # moe
        hidden_states, residual = self.moe_layer_communicator.prepare_mlp(
            hidden_states, residual, forward_batch
        )
        # The MoE branch works on copies so the dense path below still sees
        # the original hidden_states / residual.
        moe_hidden_states = hidden_states.clone()
        moe_residual = residual.clone()
        moe_hidden_states = self.mlp(moe_hidden_states)
        # Only moe_hidden_states is used afterwards; moe_residual is discarded.
        moe_hidden_states, moe_residual = self.moe_layer_communicator.postprocess_layer(
            moe_hidden_states, moe_residual, forward_batch
        )

        # Dense path: first MLP -> second attention -> second MLP.
        hidden_states, residual = self.forward_mlp(
            hidden_states, positions, residual, forward_batch, zero_allocator
        )

        hidden_states = moe_hidden_states + hidden_states
        return hidden_states, residual

    def forward_mlp(
        self, hidden_states, positions, residual, forward_batch, zero_allocator
    ):
        """Run the dense path and return ``(hidden_states, residual)``."""
        # first_mlp
        hidden_states = self.mlps[0](hidden_states)
        # TP all_reduce
        hidden_states = tensor_model_parallel_all_reduce(hidden_states)

        # second_attn
        hidden_states, residual = self.mlp_layer_communicator[1].prepare_attn(
            hidden_states, residual, forward_batch
        )
        # As in forward, attention is skipped for an empty batch.
        if hidden_states.shape[0] != 0:
            hidden_states = self.self_attn[1](
                positions=positions,
                hidden_states=hidden_states,
                forward_batch=forward_batch,
                zero_allocator=zero_allocator,
            )

        # second_mlp
        hidden_states, residual = self.mlp_layer_communicator[1].prepare_mlp(
            hidden_states, residual, forward_batch
        )
        hidden_states = self.mlps[1](hidden_states)
        # TP all_reduce
        hidden_states = tensor_model_parallel_all_reduce(hidden_states)

        hidden_states, residual = self.mlp_layer_communicator[1].postprocess_layer(
            hidden_states, residual, forward_batch
        )

        return hidden_states, residual
490
+
491
+
492
class LongcatFlashModel(nn.Module):
    """Token embedding, the stack of LongCat-Flash decoder layers and the final norm."""

    fall_back_to_pt_during_load = False

    def __init__(
        self,
        config: LongcatFlashConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Build the embedding, ``config.num_hidden_layers`` decoder layers and the norm.

        Args:
            config: Model configuration.
            quant_config: Optional quantization config passed to every layer.
            prefix: Parameter-name prefix for the layers.
        """
        super().__init__()
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            enable_tp=not is_dp_attention_enabled(),
        )

        # A CUDA stream can only be created when a CUDA(-compatible) device is
        # usable; the decoder layers accept ``alt_stream=None``, so fall back
        # to that instead of failing at construction on other platforms.
        self.alt_stream = (
            torch.cuda.Stream() if torch.cuda.is_available() else None
        )
        self.layers = nn.ModuleList(
            [
                LongcatFlashDecoderLayer(
                    config,
                    layer_id,
                    quant_config=quant_config,
                    prefix=add_prefix(f"layers.{layer_id}", prefix),
                    alt_stream=self.alt_stream,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def get_input_embeddings(self) -> VocabParallelEmbedding:
        """Return the token-embedding module (not a tensor)."""
        return self.embed_tokens

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run all decoder layers and the final norm.

        Args:
            input_ids: Token ids; embedded only when ``input_embeds`` is None.
            positions: Position tensor forwarded to every layer.
            forward_batch: Batch metadata forwarded to every layer.
            input_embeds: Optional pre-computed embeddings used instead of
                ``input_ids``.

        Returns:
            The final hidden states. The final norm is skipped when the
            hidden states have zero rows.
        """
        total_num_layers = len(self.layers)
        device = input_embeds.device if input_embeds is not None else input_ids.device
        # Two slots per layer, doubled when the batch can run two-batch
        # overlap (``forward_batch.can_run_tbo``).
        zero_allocator = BumpAllocator(
            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
            dtype=torch.float32,
            device=device,
        )

        hidden_states = (
            self.embed_tokens(input_ids) if input_embeds is None else input_embeds
        )
        residual = None

        recorder = get_global_expert_distribution_recorder
        for layer_index, layer in enumerate(self.layers):
            # Tag expert-distribution statistics with the current layer index.
            with recorder().with_current_layer(layer_index):
                hidden_states, residual = layer(
                    positions, hidden_states, forward_batch, residual, zero_allocator
                )

        if hidden_states.shape[0] != 0:
            if residual is None:
                hidden_states = self.norm(hidden_states)
            else:
                hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states
562
+
563
+
564
+ class LongcatFlashForCausalLM(nn.Module):
565
+ # for quark model load
566
+ packed_modules_mapping = {}
567
+
568
def __init__(
    self,
    config: LongcatFlashConfig,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
) -> None:
    """Build the backbone model, the LM head and the logits processor."""
    super().__init__()

    # for quark model load
    # q_a_proj and kv_a_proj_with_mqa are fused along the output dimension
    # whenever the config defines a non-None q_lora_rank.
    self.fuse_qkv_a_proj = getattr(config, "q_lora_rank", None) is not None
    if self.fuse_qkv_a_proj:
        # This writes into the dict declared on the class body.
        self.packed_modules_mapping.update(
            {"fused_qkv_a_proj_with_mqa": ["q_a_proj", "kv_a_proj_with_mqa"]}
        )

    self.config = config
    self.tp_size = get_tensor_model_parallel_world_size()
    self.quant_config = quant_config

    model_prefix = add_prefix("model", prefix)
    self.model = LongcatFlashModel(config, quant_config, prefix=model_prefix)

    head_prefix = add_prefix("lm_head", prefix)
    self.lm_head = ParallelLMHead(
        config.vocab_size,
        config.hidden_size,
        quant_config=quant_config,
        prefix=head_prefix,
        use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
    )
    self.logits_processor = LogitsProcessor(config)
601
+
602
def get_input_embeddings(self) -> nn.Embedding:
    """Return the backbone's token-embedding module."""
    backbone = self.model
    return backbone.embed_tokens
604
+
605
@torch.no_grad()
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    forward_batch: ForwardBatch,
    input_embeds: torch.Tensor = None,
) -> torch.Tensor:
    """Run the backbone and hand its hidden states to the logits processor."""
    backbone_out = self.model(input_ids, positions, forward_batch, input_embeds)
    result = self.logits_processor(
        input_ids, backbone_out, self.lm_head, forward_batch
    )
    return result
618
+
619
def post_load_weights(self, weight_names=None):
    """Derive the per-attention ``w_kc`` / ``w_vc`` (and scale) tensors from ``kv_b_proj``.

    Args:
        weight_names: Optional iterable of loaded weight names. When given,
            only layers that had a ``kv_b_proj`` weight among those names are
            processed; when ``None``, every layer is processed.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view — confirm against the packaged file.

    # Perform post-processing after loading weights
    if weight_names is None:
        layer_ids = range(self.config.num_hidden_layers)
    else:
        layer_ids = set()
        for name in weight_names:
            if "kv_b_proj" in name:
                # The layer index is the third dot-separated component;
                # presumably names look like "model.layers.<id>...." — verify.
                layer_id = int(name.split(".")[2])
                if layer_id < self.config.num_hidden_layers:
                    layer_ids.add(layer_id)

    for layer_id in layer_ids:
        # Every decoder layer has two attention modules.
        for i in range(2):
            self_attn = self.model.layers[layer_id].self_attn[i]
            if hasattr(self_attn.kv_b_proj, "qweight"):
                # AWQ compatible
                if _is_cuda or _is_hip:
                    w = awq_dequantize(
                        self_attn.kv_b_proj.qweight,
                        self_attn.kv_b_proj.scales,
                        self_attn.kv_b_proj.qzeros,
                    ).T
                else:
                    # The fallback implementation takes three extra
                    # positional arguments, all passed as 0 here.
                    w = awq_dequantize(
                        self_attn.kv_b_proj.qweight,
                        self_attn.kv_b_proj.scales,
                        self_attn.kv_b_proj.qzeros,
                        0,
                        0,
                        0,
                    ).T
            else:
                w = self_attn.kv_b_proj.weight
            use_deep_gemm_bmm = False

            # --- FP8 weights -------------------------------------------------
            if w.dtype in (
                torch.float8_e4m3fn,
                torch.float8_e4m3fnuz,
            ):
                if (
                    hasattr(self.quant_config, "weight_block_size")
                    and self.quant_config.weight_block_size is not None
                ):
                    # Block-quantized FP8: scales live in `weight_scale_inv`.
                    weight_block_size = self.quant_config.weight_block_size
                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                    if _is_fp8_fnuz:
                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                            weight=w,
                            weight_scale=self_attn.kv_b_proj.weight_scale_inv,
                            input_scale=None,
                        )
                    else:
                        weight = w
                        weight_scale = self_attn.kv_b_proj.weight_scale_inv

                    if (
                        _is_cuda
                        and weight_block_size[0] == 128
                        and weight_block_size[1] == 128
                    ):
                        if (
                            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
                            and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
                            and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
                        ):
                            # Keep the FP8 weight and its block scales; they
                            # are split per head in the else-branch further down.
                            block_scale = weight_scale
                            use_deep_gemm_bmm = True
                        else:
                            # Otherwise dequantize to bfloat16.
                            w = block_quant_dequant(
                                weight,
                                weight_scale,
                                weight_block_size,
                                torch.bfloat16,
                            )
                    else:
                        # Other platforms / block sizes: convert the block
                        # scales to a single tensor-level scale.
                        w, scale = block_quant_to_tensor_quant(
                            weight, weight_scale, weight_block_size
                        )
                        self_attn.w_scale = scale
                else:
                    # FP8 without a block size: scales are read from
                    # `weight_scale` and converted to a tensor-level scale.
                    if _is_fp8_fnuz:
                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                            weight=w,
                            weight_scale=self_attn.kv_b_proj.weight_scale,
                            input_scale=None,
                        )
                    else:
                        weight = w
                        weight_scale = self_attn.kv_b_proj.weight_scale

                    w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
                    self_attn.w_scale = scale

            # --- INT8 weights ------------------------------------------------
            if w.dtype == torch.int8:
                if hasattr(self.quant_config, "weight_block_size"):
                    # Block-wise int8 needs this dequantization.
                    weight_block_size = self.quant_config.weight_block_size
                    if weight_block_size is not None:
                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
                        weight = w
                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
                        w = int8_block_dequant(
                            weight, weight_scale, weight_block_size
                        ).to(torch.bfloat16)
                else:
                    # Channel-wise int8 needs this dequantization.
                    w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
                        torch.bfloat16
                    )

            # Group dim 0 into chunks of (qk_nope_head_dim + v_head_dim) rows,
            # then split each chunk into its key part and its value part.
            w_kc, w_vc = w.unflatten(
                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
            if not use_deep_gemm_bmm:
                self_attn.w_kc = bind_or_assign(
                    self_attn.w_kc,
                    w_kc.transpose(1, 2).contiguous().transpose(1, 2),
                )
                self_attn.w_vc = bind_or_assign(
                    self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
                )
                # Fall back to the projection's own weight_scale when no
                # scale was set by the branches above.
                if (
                    hasattr(self_attn.kv_b_proj, "weight_scale")
                    and self_attn.w_scale is None
                ):
                    self_attn.w_scale = bind_or_assign(
                        self_attn.w_scale, self_attn.kv_b_proj.weight_scale
                    )
                    if _is_hip:
                        self_attn.w_scale *= 2.0
                # TODO: remove this after adding FP8 support in bmm cpu kernel
                if (
                    _is_cpu
                    and _is_cpu_amx_available
                    and w.dtype == torch.float8_e4m3fn
                ):
                    self_attn.w_kc = (
                        self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
                    )
                    self_attn.w_vc = (
                        self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
                    )
            else:
                # DeepGEMM BMM path: split the block scales the same way as
                # the weights, in units of scale tiles.
                num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
                num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
                ws_kc, ws_vc = block_scale.unflatten(
                    0, (-1, (num_tiles_k + num_tiles_n))
                ).split([num_tiles_k, num_tiles_n], dim=1)
                self_attn.w_scale_k = bind_or_assign(
                    self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
                )
                self_attn.w_scale_v = bind_or_assign(
                    self_attn.w_scale_v, ws_vc.contiguous()
                )
                self_attn.w_kc = bind_or_assign(
                    self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
                )
                self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
                self_attn.use_deep_gemm_bmm = True

            # Optionally fold sqrt(hidden_size / lora_rank) into the
            # layernorm weights, in place. Repeated calls for the same layer
            # multiply the factor in again.
            if self.config.mla_scale_q_lora:
                self_attn.q_a_layernorm.weight.data *= (
                    self.config.hidden_size / self.config.q_lora_rank
                ) ** 0.5
            if self.config.mla_scale_kv_lora:
                self_attn.kv_a_layernorm.weight.data *= (
                    self.config.hidden_size / self.config.kv_lora_rank
                ) ** 0.5

    # TODO(linguoyuan) EPMoE not support DEEPGEMM_BLACKWELL, DeepEP needs to be supported in the future
    # This overwrites a module-level flag on deep_gemm_wrapper, and because
    # the flag is tested immediately below, the requantization branch cannot
    # be taken from this call.
    deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0 = False

    if (
        deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
        and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
        and hasattr(self.quant_config, "weight_block_size")
        and self.quant_config.weight_block_size is not None
    ):
        self._weight_requant_ue8m0()
800
+
801
def _weight_requant_ue8m0(self):
    """Requantize block-scaled weights of every decoder layer in place.

    For each layer this visits both attention modules, both dense MLPs and
    (when the experts are a ``DeepEPMoE``) the expert weights, calling
    ``requant_weight_ue8m0_inplace`` with ``quant_config.weight_block_size``.
    Attention and MLP modules without a ``weight_scale_inv`` attribute are
    skipped.
    """
    weight_block_size = self.quant_config.weight_block_size

    def _requant_modules(modules):
        # Requantize each module that carries block scales.
        for module in modules:
            if hasattr(module, "weight_scale_inv"):
                requant_weight_ue8m0_inplace(
                    module.weight, module.weight_scale_inv, weight_block_size
                )

    for layer_id in range(self.config.num_hidden_layers):
        layer = self.model.layers[layer_id]
        for i in range(2):
            self_attn = layer.self_attn[i]
            attn_modules = [
                self_attn.kv_b_proj,
                self_attn.o_proj,
            ]

            # The query-side projections differ depending on whether the
            # low-rank (q_lora) path is configured.
            if self.config.q_lora_rank is not None:
                attn_modules.append(self_attn.fused_qkv_a_proj_with_mqa)
                attn_modules.append(self_attn.q_b_proj)
            else:
                attn_modules.append(self_attn.kv_a_proj_with_mqa)
                attn_modules.append(self_attn.q_proj)
            _requant_modules(attn_modules)

            mlp = layer.mlps[i]
            assert isinstance(mlp, LongcatFlashMLP)
            _requant_modules([mlp.gate_up_proj, mlp.down_proj])

    for layer_id in range(self.config.num_hidden_layers):
        # Look the layer up per iteration: reusing the variable left over
        # from the loop above would requantize only the last layer's experts
        # (repeatedly) and skip all the others.
        experts = self.model.layers[layer_id].mlp.experts
        if isinstance(experts, DeepEPMoE):
            for w in [
                experts.w13_weight_fp8,
                experts.w2_weight_fp8,
            ]:
                # Each entry is indexed as (weight, scale).
                requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
845
+
846
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Load checkpoint tensors into this model's parameters.

    Each ``(name, tensor)`` pair from ``weights`` is routed to a parameter
    and its loader call is submitted to a thread pool:

    * Names containing ``"mtp"`` are ignored entirely; names containing
      ``"rotary_emb.inv_freq"`` are recorded but not loaded.
    * ``gate_proj`` / ``up_proj`` are mapped onto the stacked
      ``gate_up_proj`` parameter with shard ids 0 / 1.
    * Expert weights are mapped through the table returned by
      ``get_moe_impl_class().make_expert_params_mapping``.
    * When ``config.q_lora_rank`` is set, ``q_a_proj`` and
      ``kv_a_proj_with_mqa`` are cached until both halves of a pair have
      arrived, then concatenated and loaded into
      ``fused_qkv_a_proj_with_mqa``.
    * Everything else is loaded by name; names absent from the parameter
      dict are logged with a warning and skipped.

    After all loader futures have completed (re-raising any exception via
    ``future.result()``), ``self.post_load_weights`` is called with the
    list of recorded weight names.

    Args:
        weights: Iterable of ``(checkpoint_name, tensor)`` pairs.
    """

    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    # Params for weights, fp8 weight scales, fp8 activation scales
    # (param_name, weight_name, expert_id, shard_id)
    expert_params_mapping = get_moe_impl_class().make_expert_params_mapping(
        ckpt_gate_proj_name="gate_proj",
        ckpt_down_proj_name="down_proj",
        ckpt_up_proj_name="up_proj",
        num_experts=self.config.n_routed_experts,
    )

    # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
    fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
        self.config.q_lora_rank is not None
    )
    # Holds whichever half of a (q_a_proj, kv_a_proj_with_mqa) pair arrives
    # first until its partner shows up; None when fusion is disabled.
    cached_a_proj = {} if fuse_qkv_a_proj else None

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        params_dict = dict(self.named_parameters())
        weight_names = []
        for name, loaded_weight in weights:
            # Tensors whose name contains "mtp" are neither recorded nor loaded.
            if "mtp" in name:
                continue
            weight_names.append(name)
            if "rotary_emb.inv_freq" in name:
                continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name:
                    continue
                # We have mlp.experts[0].gate_proj in the checkpoint.
                # Since we handle the experts below in expert_params_mapping,
                # we need to skip here BEFORE we update the name, otherwise
                # name will be updated to mlp.experts[0].gate_up_proj, which
                # will then be updated below in expert_params_mapping
                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                if ("mlp.experts." in name) and name not in params_dict:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                futures.append(
                    executor.submit(weight_loader, param, loaded_weight, shard_id)
                )
                break
            else:
                # No stacked mapping consumed this tensor; try the expert table.
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    futures.append(
                        executor.submit(
                            weight_loader,
                            param,
                            loaded_weight,
                            name,
                            shard_id=shard_id,
                            expert_id=expert_id,
                        )
                    )
                    break
                else:
                    # Neither a stacked nor an expert weight.
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    if fuse_qkv_a_proj and (
                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
                    ):
                        cached_a_proj[name] = loaded_weight
                        # Derive both names of the pair from whichever half
                        # this tensor is.
                        q_a_proj_name = (
                            name
                            if "q_a_proj" in name
                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
                        )
                        kv_a_proj_name = (
                            name
                            if "kv_a_proj_with_mqa" in name
                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
                        )

                        # When both q_a_proj and kv_a_proj_with_mqa have been cached, load the fused weight to parameter
                        if (
                            q_a_proj_name in cached_a_proj
                            and kv_a_proj_name in cached_a_proj
                        ):
                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
                            # Concatenate along dim 0 by default; the listed
                            # quantization schemes concatenate along dim 1.
                            cat_dim = 0
                            if self.quant_config is not None and (
                                self.quant_config.get_name() == "awq"
                                or self.quant_config.get_name() == "awq_marlin"
                                or self.quant_config.get_name() == "moe_wna16"
                            ):
                                cat_dim = 1
                            fused_weight = torch.cat(
                                [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
                            )
                            param_name = (
                                name.replace(
                                    "q_a_proj", "fused_qkv_a_proj_with_mqa"
                                )
                                if "q_a_proj" in name
                                else name.replace(
                                    "kv_a_proj_with_mqa",
                                    "fused_qkv_a_proj_with_mqa",
                                )
                            )
                            param = params_dict[param_name]

                            weight_loader = getattr(
                                param, "weight_loader", default_weight_loader
                            )
                            futures.append(
                                executor.submit(weight_loader, param, fused_weight)
                            )
                            # Drop both halves now that the fused load is queued.
                            cached_a_proj.pop(q_a_proj_name)
                            cached_a_proj.pop(kv_a_proj_name)
                    else:
                        if (
                            "k_scale" in name or "v_scale" in name
                        ) and name not in params_dict:
                            # modelopt attn kv scale is named differently
                            for scale in ["k_scale", "v_scale"]:
                                if scale in name:
                                    name = name.replace(
                                        f"{scale[0]}_proj", "attn_mqa"
                                    )
                                    break
                        if name not in params_dict:
                            # modelopt ckpt contains not needed weights for MTP module:
                            # model.decoder.self_attn.attn_mqa.v_scale and
                            # model.decoder.self_attn.attn_mqa.k_scale
                            logger.warning(f"{name} not found in params_dict.")
                            continue
                        param = params_dict[name]
                        weight_loader = getattr(
                            param, "weight_loader", default_weight_loader
                        )
                        futures.append(
                            executor.submit(weight_loader, param, loaded_weight)
                        )

        # Wait for all tasks to complete and raise any exceptions.
        for future in concurrent.futures.as_completed(futures):
            future.result()

    self.post_load_weights(weight_names=weight_names)
1006
+
1007
def get_embed_and_head(self):
    """Return the token-embedding weight and the LM-head weight as a pair."""
    embed_weight = self.model.embed_tokens.weight
    head_weight = self.lm_head.weight
    return embed_weight, head_weight
1009
+
1010
def set_embed_and_head(self, embed, head):
    """Replace the token-embedding and LM-head weights.

    The existing ``weight`` attributes are deleted first, then ``embed``
    and ``head`` are assigned in their place. Afterwards the CUDA caching
    allocator is emptied and the device is synchronized.
    """
    embed_module = self.model.embed_tokens
    head_module = self.lm_head

    # Remove both old weights before installing the replacements.
    del embed_module.weight
    del head_module.weight

    embed_module.weight = embed
    head_module.weight = head

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
1017
+
1018
@classmethod
def get_model_config_for_expert_location(cls, config):
    """Build a ``ModelConfigForExpertLocation`` from a model config.

    ``num_layers`` is taken from ``config.num_hidden_layers`` and
    ``num_logical_experts`` from ``config.n_routed_experts``.
    """
    layer_count = config.num_hidden_layers
    routed_expert_count = config.n_routed_experts
    return ModelConfigForExpertLocation(
        num_layers=layer_count,
        num_logical_experts=routed_expert_count,
    )
1024
+
1025
+
1026
# Module-level list naming the model class implemented in this file.
# NOTE(review): presumably read by the model loader/registry to discover
# the architecture -- confirm against the loader.
EntryClass = [LongcatFlashForCausalLM]