sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +23 -3
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +98 -603
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +6 -0
- sglang/srt/managers/io_struct.py +12 -2
- sglang/srt/managers/scheduler.py +116 -669
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +166 -83
- sglang/srt/managers/tp_worker.py +9 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +20 -13
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +15 -56
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +18 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -7,11 +7,16 @@ from typing import List, Optional, Tuple
 import torch
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_rank,
+    get_moe_expert_parallel_world_size,
+    get_moe_tensor_parallel_rank,
+    get_moe_tensor_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from sglang.srt.
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.layers.moe.topk import StandardTopKOutput
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
@@ -62,8 +67,9 @@ class FusedMoE(torch.nn.Module):
         num_experts: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         top_k: Optional[int] = None,
-
+        num_fused_shared_experts: int = 0,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
@@ -77,21 +83,19 @@ class FusedMoE(torch.nn.Module):
         routed_scaling_factor: Optional[float] = None,
         enable_flashinfer_cutlass_moe: Optional[bool] = False,
         enable_ep_moe: Optional[bool] = False,
-        skip_quant: Optional[bool] = False,
     ):
         super().__init__()
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.layer_id = layer_id
         self.top_k = top_k
         self.hidden_size = hidden_size
-        self.tp_size = (
-            tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
-        )
-        self.tp_rank = get_tensor_model_parallel_rank()
         self.num_experts = num_experts
-        self.
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.expert_map_cpu = None
+        self.expert_map_gpu = None
 
         if enable_flashinfer_cutlass_moe and quant_config is None:
             logger.warning("Disable flashinfer MoE when quantization config is None.")
@@ -99,28 +103,27 @@ class FusedMoE(torch.nn.Module):
             enable_ep_moe = False
 
         self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.moe_ep_rank = get_moe_expert_parallel_rank()
+        self.moe_tp_size = get_moe_tensor_parallel_world_size()
+        self.moe_tp_rank = get_moe_tensor_parallel_rank()
+        assert num_experts % self.moe_ep_size == 0
+        self.num_local_experts = num_experts // self.moe_ep_size
         if enable_ep_moe:
-
-            self.ep_rank = self.tp_rank
-            self.tp_size = 1
-            self.tp_rank = 0
+            # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
-            self.
+            self.expert_map_cpu = torch.full((self.num_experts,), -1, dtype=torch.int32)
             # Create a expert map for the local experts
-
-
-
-            self.ep_rank
-            * self.num_local_experts : (self.ep_rank + 1)
+            self.expert_map_cpu[
+                self.moe_ep_rank
+                * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-
-
-            self.ep_rank = 0
-            self.num_local_experts = num_experts
+            self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+
         self.routed_scaling_factor = routed_scaling_factor
-        assert intermediate_size % self.
-        self.intermediate_size_per_partition = intermediate_size // self.
+        assert intermediate_size % self.moe_tp_size == 0
+        self.intermediate_size_per_partition = intermediate_size // self.moe_tp_size
         self.reduce_results = reduce_results
         self.activation = activation
         self.apply_router_weight_on_input = apply_router_weight_on_input
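The constructor now takes its parallel layout from the new MoE-specific helpers (`get_moe_expert_parallel_*`, `get_moe_tensor_parallel_*`) instead of overwriting `tp_size`/`tp_rank`, and keeps the expert map both on CPU (for weight loading) and as a CUDA copy (for routing). A minimal sketch of the mapping built under `enable_ep_moe`, using hypothetical sizes and plain CPU tensors:

```python
import torch

# Hypothetical layout: 8 experts split across 2 EP ranks, viewed from rank 1.
num_experts, moe_ep_size, moe_ep_rank = 8, 2, 1
num_local_experts = num_experts // moe_ep_size  # 4 experts owned by this rank

# Global expert ids owned by this rank map to local slots; others map to -1.
expert_map_cpu = torch.full((num_experts,), -1, dtype=torch.int32)
expert_map_cpu[
    moe_ep_rank * num_local_experts : (moe_ep_rank + 1) * num_local_experts
] = torch.arange(0, num_local_experts, dtype=torch.int32)

print(expert_map_cpu.tolist())  # [-1, -1, -1, -1, 0, 1, 2, 3]
```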
@@ -132,9 +135,6 @@ class FusedMoE(torch.nn.Module):
             not _is_cpu and global_server_args_dict["enable_triton_kernel_moe"]
         )
 
-        if skip_quant:
-            return
-
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = UnquantizedFusedMoEMethod(
                 self.use_triton_kernels
@@ -363,9 +363,9 @@ class FusedMoE(torch.nn.Module):
             expert_data.copy_(loaded_weight)
 
     def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
-        if self.
+        if self.expert_map_cpu is None:
             return expert_id
-        return self.
+        return self.expert_map_cpu[expert_id].item()
 
     def weight_loader(
         self,
@@ -375,10 +375,48 @@ class FusedMoE(torch.nn.Module):
         shard_id: str,
         expert_id: int,
     ) -> None:
+
+        global_expert_location_metadata = get_global_expert_location_metadata()
+        if global_expert_location_metadata is None:
+            self._weight_loader_impl(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=expert_id,
+            )
+            return
+
+        if expert_id >= self.num_experts - self.num_fused_shared_experts:
+            # This is a shared expert.
+            physical_expert_ids = [expert_id]
+        else:
+            physical_expert_ids = (
+                global_expert_location_metadata.logical_to_all_physical(
+                    self.layer_id, expert_id
+                )
+            )
+
+        for physical_expert_id in physical_expert_ids:
+            self._weight_loader_physical(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=physical_expert_id,
+            )
+
+    def _weight_loader_physical(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+    ) -> None:
         expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
         if expert_id == -1:
             return
-
         self._weight_loader_impl(
             param=param,
             loaded_weight=loaded_weight,
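When EPLB expert-location metadata is available, the new `weight_loader` fans a logical expert's checkpoint tensor out to every physical replica of that expert before `_weight_loader_physical` drops the replicas that are not local to this rank. A conceptual stand-in for that fan-out (the dict below is a hypothetical replacement for `logical_to_all_physical`, not the real metadata object):

```python
import torch

# Hypothetical logical -> physical replica mapping for one MoE layer.
logical_to_physical = {0: [0, 5], 1: [1], 2: [2, 7]}

def load_logical_expert(expert_id, loaded_weight, write_physical):
    # Write the same checkpoint tensor into every physical copy of the expert.
    for physical_id in logical_to_physical.get(expert_id, [expert_id]):
        write_physical(physical_id, loaded_weight)

load_logical_expert(
    0,
    torch.zeros(4),
    lambda pid, w: print(f"physical expert {pid} <- tensor{tuple(w.shape)}"),
)
# physical expert 0 <- tensor(4,)
# physical expert 5 <- tensor(4,)
```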
@@ -396,8 +434,7 @@ class FusedMoE(torch.nn.Module):
         expert_id: int,
     ) -> None:
 
-
-        tp_rank = 0 if self.ep_size > 1 else get_tensor_model_parallel_rank()
+        tp_rank = self.moe_tp_rank
 
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
@@ -571,9 +608,14 @@ class FusedMoE(torch.nn.Module):
             )
             return
 
-    def forward(self, hidden_states: torch.Tensor, topk_output:
+    def forward(self, hidden_states: torch.Tensor, topk_output: StandardTopKOutput):
         assert self.quant_method is not None
 
+        if self.expert_map_gpu is not None:
+            topk_output = topk_output._replace(
+                topk_ids=self.expert_map_gpu[topk_output.topk_ids]
+            )
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
@@ -584,17 +626,17 @@ class FusedMoE(torch.nn.Module):
             routed_scaling_factor=self.routed_scaling_factor,
             **(
                 dict(
-                    tp_rank=self.
-                    tp_size=self.
-                    ep_rank=self.
-                    ep_size=self.
+                    tp_rank=self.moe_tp_rank,
+                    tp_size=self.moe_tp_size,
+                    ep_rank=self.moe_ep_rank,
+                    ep_size=self.moe_ep_size,
                 )
                 if self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod"
                 else {}
             ),
         )
 
-        if self.reduce_results and (self.
+        if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states
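The forward path now remaps routed expert ids through `expert_map_gpu` before `quant_method.apply`, so experts owned by other EP ranks become -1 and local experts get contiguous local ids. A small CPU-only illustration using a plain NamedTuple in place of `StandardTopKOutput` (hypothetical values):

```python
import torch
from typing import NamedTuple

class TopKOut(NamedTuple):  # stand-in for StandardTopKOutput
    topk_weights: torch.Tensor
    topk_ids: torch.Tensor

# Same mapping as in the constructor sketch: this rank owns global experts
# 4..7 as local experts 0..3.
expert_map = torch.tensor([-1, -1, -1, -1, 0, 1, 2, 3], dtype=torch.int32)

out = TopKOut(topk_weights=torch.rand(2, 2),
              topk_ids=torch.tensor([[4, 7], [1, 5]]))
out = out._replace(topk_ids=expert_map[out.topk_ids])

print(out.topk_ids.tolist())  # [[0, 3], [-1, 1]]  (-1 = expert not on this rank)
```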
@@ -627,3 +669,20 @@ class FusedMoE(torch.nn.Module):
                 ("w3", ckpt_up_proj_name),
             ]
         ]
+
+    @classmethod
+    def make_expert_input_scale_params_mapping(
+        cls,
+        num_experts: int,
+    ) -> List[Tuple[str, str, int, str]]:
+        # (param_name, weight_name, expert_id, shard_id)
+        return [
+            (
+                "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
+                f"experts.{expert_id}.{shard_id}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id in ["w1", "w2", "w3"]
+        ]
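A possible use of the new `make_expert_input_scale_params_mapping` classmethod (assuming the 0.4.10 wheel is installed): it yields `(param_name, weight_name, expert_id, shard_id)` tuples that a model's weight-loading loop can match against per-expert input-scale keys in a checkpoint.

```python
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

for entry in FusedMoE.make_expert_input_scale_params_mapping(num_experts=2):
    print(entry)
# ('experts.w13_', 'experts.0.w1.', 0, 'w1')
# ('experts.w2_', 'experts.0.w2.', 0, 'w2')
# ('experts.w13_', 'experts.0.w3.', 0, 'w3')
# ... and likewise for expert 1
```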
sglang/srt/layers/moe/token_dispatcher/__init__.py
File without changes
sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py
ADDED
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from enum import Enum, auto
+from typing import TYPE_CHECKING, NamedTuple, Protocol, runtime_checkable
+
+import torch
+
+
+class DispatchOutputFormat(Enum):
+    standard = auto()
+    deepep_normal = auto()
+    deepep_ll = auto()
+
+    def is_standard(self) -> bool:
+        return self == DispatchOutputFormat.standard
+
+    def is_deepep_normal(self) -> bool:
+        return self == DispatchOutputFormat.deepep_normal
+
+    def is_deepep_ll(self) -> bool:
+        return self == DispatchOutputFormat.deepep_ll
+
+
+@runtime_checkable
+class DispatchOutput(Protocol):
+    """Protocol for dispatch outputs in different formats."""
+
+    @property
+    def format(self) -> DispatchOutputFormat: ...
+
+
+class BaseDispatcherConfig(ABC):
+    """Base class for dispatcher configs."""
+
+    pass
+
+
+class BaseDispatcher(ABC):
+    """Base class for dispatchers."""
+
+    @abstractmethod
+    def dispatch(self, *args, **kwargs) -> DispatchOutput:
+        pass
+
+    @abstractmethod
+    def combine(self, *args, **kwargs) -> torch.Tensor:
+        pass
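The new `base_dispatcher` module defines the dispatcher interface as an ABC plus a `runtime_checkable` output protocol. A minimal sketch (assuming the 0.4.10 wheel) of a custom dispatcher whose output satisfies `DispatchOutput`; the names `MyDispatchOutput` and `PassthroughDispatcher` are illustrative, not part of sglang:

```python
from typing import NamedTuple

import torch

from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
    BaseDispatcher,
    DispatchOutput,
    DispatchOutputFormat,
)

class MyDispatchOutput(NamedTuple):
    hidden_states: torch.Tensor

    @property
    def format(self) -> DispatchOutputFormat:
        return DispatchOutputFormat.standard

class PassthroughDispatcher(BaseDispatcher):
    def dispatch(self, hidden_states: torch.Tensor) -> DispatchOutput:
        return MyDispatchOutput(hidden_states=hidden_states)

    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return hidden_states

out = PassthroughDispatcher().dispatch(torch.zeros(1, 4))
assert isinstance(out, DispatchOutput) and out.format.is_standard()
```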
sglang/srt/layers/moe/token_dispatcher/standard.py
ADDED
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import NamedTuple
+
+from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
+    DispatchOutput,
+    DispatchOutputFormat,
+)
+
+
+class StandardDispatchOutput(NamedTuple):
+    """Standard dispatch output."""
+
+    @property
+    def format(self) -> DispatchOutputFormat:
+        return DispatchOutputFormat.standard
+
+
+assert isinstance(StandardDispatchOutput, DispatchOutput)
sglang/srt/layers/moe/topk.py
CHANGED
@@ -397,7 +397,9 @@ def grouped_topk_gpu(
         .reshape(num_token, -1)
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-    topk_weights, topk_ids = torch.topk(
+    topk_weights, topk_ids = torch.topk(
+        tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
+    )
     if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
@@ -486,7 +488,9 @@ def biased_grouped_topk_impl(
     tmp_scores = scores_for_choice.masked_fill(
         ~score_mask.bool(), float("-inf")
    )  # [n, e]
-    _, topk_ids = torch.topk(
+    _, topk_ids = torch.topk(
+        tmp_scores, k=topk, dim=-1, sorted=num_fused_shared_experts > 0
+    )
     topk_weights = scores.gather(1, topk_ids)
 
     if num_fused_shared_experts:
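Both top-k call sites now pass `sorted=num_fused_shared_experts > 0`. When a fused shared expert is in play, the surrounding code overwrites the last top-k slot with a shared-expert id (`topk_ids[:, -1] = torch.randint(low=num_experts, ...)` above), and sorted output makes that overwritten slot the lowest-scoring of the selected experts; without fused shared experts the sort is skipped. A small illustration with hypothetical scores:

```python
import torch

scores = torch.tensor([[0.1, 0.9, 0.3, 0.5]])
num_experts, topk = 4, 3  # last slot reserved for a fused shared expert

topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=True)
print(topk_ids.tolist())  # [[1, 3, 2]] -- descending score order

# Overwrite the weakest routed pick with a shared-expert id (fixed here,
# chosen randomly in the actual code).
topk_ids[:, -1] = num_experts
print(topk_ids.tolist())  # [[1, 3, 4]]
```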
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -172,7 +172,6 @@ class Fp8Config(QuantizationConfig):
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional[QuantizeMethodBase]:
         from sglang.srt.layers.linear import LinearBase
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, LinearBase):
@@ -181,8 +180,6 @@ class Fp8Config(QuantizationConfig):
             return Fp8LinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return Fp8MoEMethod(self)
-        elif isinstance(layer, EPMoE):
-            return Fp8EPMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -984,23 +981,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
 
-        if isinstance(layer, EPMoE):
-            layer.w13_weight_scale = (
-                layer.w13_weight_scale_inv
-                if self.block_quant
-                else layer.w13_weight_scale
-            )
-            layer.w2_weight_scale = (
-                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
-            )
-            return layer.run_moe(
-                hidden_states=x,
-                topk_output=topk_output,
-            )
-
         if use_intel_amx_backend(layer):
             from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
 
sglang/srt/layers/quantization/modelopt_quant.py
CHANGED
@@ -900,6 +900,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         layer.w13_blockscale_swizzled = Parameter(
             w13_blockscale_swizzled, requires_grad=False
         )
+        del layer.w13_weight_scale
 
         # This is for quantization, so we need to invert it.
         layer.w13_input_scale_quant = Parameter(
@@ -935,6 +936,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         layer.w2_blockscale_swizzled = Parameter(
             w2_blockscale_swizzled, requires_grad=False
         )
+        del layer.w2_weight_scale
         layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
         device = layer.w13_weight.device
sglang/srt/layers/quantization/unquant.py
CHANGED
@@ -204,14 +204,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
 
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
-
-        if isinstance(layer, EPMoE):
-            return layer.run_moe(
-                hidden_states=x,
-                topk_output=topk_output,
-            )
-
         return self.forward(
             x=x,
             layer=layer,