sglang 0.4.9.post6__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +3 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +10 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +20 -640
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +0 -2
- sglang/srt/managers/scheduler.py +89 -671
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +123 -74
- sglang/srt/managers/tp_worker.py +4 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +5 -0
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +2 -0
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +10 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/METADATA +3 -4
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/RECORD +69 -56
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -138,6 +138,7 @@ class BenchArgs:
 def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+    moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)

     model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
@@ -146,6 +147,8 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
+        moe_ep_rank=moe_ep_rank,
+        moe_ep_size=server_args.ep_size,
         pp_rank=0,
         pp_size=1,
         nccl_port=port_args.nccl_port,
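As a quick illustration of the new rank mapping (a standalone sketch with made-up sizes, not part of the benchmark script): consecutive tensor-parallel ranks share one expert-parallel rank, so with tp_size=8 and ep_size=4 each pair of TP ranks maps to a single MoE EP rank.

# Sketch of the moe_ep_rank formula used above; tp_size/ep_size are example values.
tp_size = 8
ep_size = 4
for tp_rank in range(tp_size):
    moe_ep_rank = tp_rank // (tp_size // ep_size)
    print(f"tp_rank={tp_rank} -> moe_ep_rank={moe_ep_rank}")
# tp_rank 0,1 -> 0; 2,3 -> 1; 4,5 -> 2; 6,7 -> 3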
sglang/srt/configs/__init__.py
CHANGED
@@ -5,6 +5,11 @@ from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)

 __all__ = [
     "ExaoneConfig",
@@ -14,4 +19,7 @@ __all__ = [
     "MultiModalityConfig",
     "KimiVLConfig",
     "MoonViTConfig",
+    "Step3VLConfig",
+    "Step3TextConfig",
+    "Step3VisionEncoderConfig",
 ]
sglang/srt/configs/model_config.py
CHANGED
@@ -335,6 +335,8 @@ class ModelConfig:
             "num_key_value_heads",
             # For ChatGLM:
             "multi_query_group_num",
+            # For Step3
+            "num_attention_groups",
         ]
         for attr in attributes:
             num_kv_heads = getattr(self.hf_text_config, attr, None)
@@ -644,6 +646,7 @@ multimodal_model_archs = [
     "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
+    "Step3VLForConditionalGeneration",
 ]
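A small sketch of what this attribute fallback accomplishes (illustrative only, not sglang code): the first candidate attribute found on the HF text config determines the KV-head count, so a Step3 config that exposes num_attention_groups resolves through the new entry. The break-on-first-match below is a simplification of the real ModelConfig method.

from types import SimpleNamespace

# Hypothetical stand-in for self.hf_text_config on a Step3 checkpoint.
hf_text_config = SimpleNamespace(num_attention_groups=1)

attributes = [
    "num_key_value_heads",    # LLaMA-style
    "multi_query_group_num",  # ChatGLM
    "num_attention_groups",   # Step3
]

num_kv_heads = None
for attr in attributes:
    num_kv_heads = getattr(hf_text_config, attr, None)
    if num_kv_heads is not None:
        break
print(num_kv_heads)  # 1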
sglang/srt/configs/step3_vl.py
ADDED
@@ -0,0 +1,172 @@
+from typing import Any, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    model_type = "step3_text"
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        moe_layers_enum: tuple[int] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
+        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
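A brief illustration (a sketch, not taken from sglang or the Step3 release) of how this composite config promotes plain dicts to typed sub-configs, which is the usual HF pattern; any field not supplied falls back to the defaults defined above.

from sglang.srt.configs.step3_vl import Step3VLConfig

cfg = Step3VLConfig(
    vision_config={"hidden_size": 1792, "num_hidden_layers": 63},
    text_config={"moe_num_experts": 48, "moe_top_k": 3},
)
print(type(cfg.vision_config).__name__)  # Step3VisionEncoderConfig
print(cfg.text_config.moe_top_k)         # 3
print(cfg.hidden_size)                   # mirrors text_config.hidden_size (7168)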
sglang/srt/conversation.py
CHANGED
@@ -994,6 +994,23 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="step3-vl",
+        system_message="<|begin▁of▁sentence|>You are a helpful assistant",
+        system_template="{system_message}\n",
+        roles=(
+            "<|BOT|>user\n",
+            "<|BOT|>assistant\n<think>\n",
+        ),
+        sep="<|EOT|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|EOT|>",
+        image_token="<im_patch>",
+        # add_bos=True,
+    )
+)
+

 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -1103,3 +1120,9 @@ def match_vila(model_path: str):
 def match_mimo_vl(model_path: str):
     if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
         return "mimo-vl"
+
+
+# @register_conv_template_matching_function
+# def match_step3(model_path: str):
+#     if re.search(r"step3", model_path, re.IGNORECASE):
+#         return "step3-vl"
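To make the new template concrete, here is a rough sketch of the prompt shape these fields imply, assembled by hand rather than through sglang's Conversation API (the exact concatenation rule of SeparatorStyle.NO_COLON_SINGLE is assumed here to be role + message + sep for completed turns, FastChat-style).

system = "<|begin▁of▁sentence|>You are a helpful assistant\n"
user_turn = "<|BOT|>user\n" + "Describe this image. <im_patch>" + "<|EOT|>"
assistant_prefix = "<|BOT|>assistant\n<think>\n"  # generation continues here and stops at <|EOT|>

prompt = system + user_turn + assistant_prefix
print(prompt)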
sglang/srt/disaggregation/decode.py
CHANGED
@@ -694,10 +694,7 @@ class SchedulerDisaggregationDecodeMixin:
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
-
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch

@@ -771,10 +768,7 @@ class SchedulerDisaggregationDecodeMixin:
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
-
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch
             self.last_batch_in_queue = last_batch_in_queue
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -287,9 +287,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.process_disagg_prefill_inflight_queue()

             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -337,9 +335,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.process_disagg_prefill_inflight_queue()

             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
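Both disaggregation event loops now delegate their idle-time housekeeping to a single scheduler helper instead of repeating the same statements. The helper itself is not shown in this diff (the scheduler is refactored elsewhere in this release); a minimal sketch of what it presumably wraps, based on the lines it replaces here:

# Hypothetical sketch of the consolidated idle check; the real self_check_during_idle
# lives on the scheduler and may do more than this.
def self_check_during_idle(self):
    self.check_memory()
    self.new_token_ratio = self.init_new_token_ratio
    self.maybe_sleep_on_idle()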
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -354,6 +354,13 @@ class GroupCoordinator:
             self.cpu_group, 1 << 22, 6
         )

+    def __repr__(self):
+        return (
+            f"ranks={self.ranks} rank={self.rank} local_rank={self.local_rank} use_pynccl={self.use_pynccl} "
+            f"device_group={self.device_group} cpu_group={self.cpu_group} unique_name={self.unique_name} "
+            f"world_size={self.world_size} rank_in_group={self.rank_in_group}"
+        )
+
     @property
     def first_rank(self):
         """Return the global rank of the first process in the group"""
@@ -1141,6 +1148,20 @@ def get_tp_group() -> GroupCoordinator:
     return _TP


+_MOE_EP: Optional[GroupCoordinator] = None
+_MOE_TP: Optional[GroupCoordinator] = None
+
+
+def get_moe_ep_group() -> GroupCoordinator:
+    assert _MOE_EP is not None, "expert model parallel group is not initialized"
+    return _MOE_EP
+
+
+def get_moe_tp_group() -> GroupCoordinator:
+    assert _MOE_TP is not None, "expert model parallel group is not initialized"
+    return _MOE_TP
+
+
 # kept for backward compatibility
 get_tensor_model_parallel_group = get_tp_group

@@ -1250,6 +1271,7 @@ def init_distributed_environment(

 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
+    expert_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     backend: Optional[str] = None,
     duplicate_tp_group: bool = False,
@@ -1327,6 +1349,45 @@ def initialize_model_parallel(
         _TP.pynccl_comm.disabled = False
         _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False

+    moe_ep_size = expert_model_parallel_size
+
+    moe_tp_size = tensor_model_parallel_size // moe_ep_size
+    global _MOE_EP
+    assert _MOE_EP is None, "expert model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        for j in range(moe_tp_size):
+            st = i * tensor_model_parallel_size + j
+            en = (i + 1) * tensor_model_parallel_size + j
+            ranks = list(range(st, en, moe_tp_size))
+            group_ranks.append(ranks)
+
+    _MOE_EP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="moe_ep",
+    )
+
+    global _MOE_TP
+    assert _MOE_TP is None, "expert model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        for j in range(moe_ep_size):
+            st = i * tensor_model_parallel_size + j * moe_tp_size
+            en = i * tensor_model_parallel_size + (j + 1) * moe_tp_size
+            ranks = list(range(st, en))
+            group_ranks.append(ranks)
+
+    _MOE_TP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="moe_tp",
+    )
+
     # Build the pipeline model-parallel groups.
     num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
     global _PP
@@ -1347,6 +1408,7 @@ def initialize_model_parallel(

 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
+    expert_model_parallel_size: int,
     pipeline_model_parallel_size: int,
     backend: Optional[str] = None,
 ) -> None:
@@ -1357,7 +1419,10 @@ def ensure_model_parallel_initialized(
     backend = backend or torch.distributed.get_backend(get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(
-            tensor_model_parallel_size,
+            tensor_model_parallel_size,
+            expert_model_parallel_size,
+            pipeline_model_parallel_size,
+            backend,
         )
         return

@@ -1417,6 +1482,26 @@ def get_tensor_model_parallel_rank():
     return get_tp_group().rank_in_group


+def get_moe_expert_parallel_world_size():
+    """Return world size for the moe expert parallel group."""
+    return get_moe_ep_group().world_size
+
+
+def get_moe_expert_parallel_rank():
+    """Return my rank for the moe expert parallel group."""
+    return get_moe_ep_group().rank_in_group
+
+
+def get_moe_tensor_parallel_world_size():
+    """Return world size for the moe tensor parallel group."""
+    return get_moe_tp_group().world_size
+
+
+def get_moe_tensor_parallel_rank():
+    """Return my rank for the moe tensor parallel group."""
+    return get_moe_tp_group().rank_in_group
+
+
 def destroy_model_parallel():
     """Set the groups to none and destroy them."""
     global _TP
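The two nested loops above split each tensor-parallel group into expert-parallel and MoE-tensor-parallel sub-groups. A standalone sketch that reproduces just the rank arithmetic (example sizes only, not a real launch):

# Reproduces the group_ranks construction from initialize_model_parallel for a
# hypothetical world of 8 ranks with tp_size=4, ep_size=2 (so moe_tp_size=2).
world_size = 8
tp_size = 4
ep_size = 2
moe_tp_size = tp_size // ep_size
num_tp_groups = world_size // tp_size

moe_ep_groups, moe_tp_groups = [], []
for i in range(num_tp_groups):
    for j in range(moe_tp_size):
        st = i * tp_size + j
        en = (i + 1) * tp_size + j
        moe_ep_groups.append(list(range(st, en, moe_tp_size)))
    for j in range(ep_size):
        st = i * tp_size + j * moe_tp_size
        en = i * tp_size + (j + 1) * moe_tp_size
        moe_tp_groups.append(list(range(st, en)))

print(moe_ep_groups)  # [[0, 2], [1, 3], [4, 6], [5, 7]]
print(moe_tp_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]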
sglang/srt/entrypoints/engine.py
CHANGED
@@ -648,29 +648,23 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.2.
+            "0.2.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

-
-
-
-
-
+    if True:  # Keep this check for internal code compatibility
+        # Register the signal handler.
+        # The child processes will send SIGQUIT to this process when any error happens
+        # This process then clean up the whole process tree
+        # Note: This sigquit handler is used in the launch phase, and may be replaced by
+        # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+        def launch_phase_sigquit_handler(signum, frame):
+            logger.error(
+                "Received sigquit from a child process. It usually means the child failed."
            )
+            kill_process_tree(os.getpid())

-
-
-    # Register the signal handler.
-    # The child processes will send SIGQUIT to this process when any error happens
-    # This process then clean up the whole process tree
-    def sigquit_handler(signum, frame):
-        logger.error(
-            "Received sigquit from a child process. It usually means the child failed."
-        )
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
+        signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)

     # Set mp start method
     mp.set_start_method("spawn", force=True)
@@ -725,6 +719,7 @@ def _launch_subprocesses(
                 + ((pp_rank % pp_size_per_node) * tp_size_per_node)
                 + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
             )
+            moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(
@@ -732,6 +727,7 @@
                     port_args,
                     gpu_id,
                     tp_rank,
+                    moe_ep_rank,
                     pp_rank,
                     None,
                     writer,
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -238,6 +238,9 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)

     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break

-
+    # This request is a special request.
+    # If the server already has something running, this request will be ignored, so it creates zero overhead.
+    # If the server is not running, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
-
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
         await asyncio.sleep(1)
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
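The updated health check follows a fire-and-poll pattern: start the probe request as a background task, then poll a "last response received" timestamp until a timeout. A self-contained asyncio sketch of that pattern, with simplified stand-ins for the tokenizer manager state and the timeout constant (not sglang's actual objects):

import asyncio
import time

HEALTH_CHECK_TIMEOUT = 5  # illustrative value; the real constant is defined elsewhere in http_server.py


async def probe(state):
    # Stand-in for sending the 1-token generation request.
    await asyncio.sleep(0.5)
    state["last_receive_tstamp"] = time.time()


async def health_generate(state) -> int:
    task = asyncio.create_task(probe(state))
    tic = time.time()
    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
        await asyncio.sleep(1)
        if state["last_receive_tstamp"] > tic:
            task.cancel()
            return 200  # healthy: something responded after we started probing
    task.cancel()
    return 503  # no response within the timeout


print(asyncio.run(health_generate({"last_receive_tstamp": 0.0})))  # 200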
sglang/srt/entrypoints/openai/serving_chat.py
CHANGED
@@ -127,12 +127,12 @@ class OpenAIServingChat(OpenAIServingBase):
             request.skip_special_tokens = False
             if not isinstance(request.tool_choice, str):
                 tools = [
-                    item.model_dump()
+                    item.function.model_dump()
                     for item in request.tools
                     if item.function.name == request.tool_choice.function.name
                 ]
             else:
-                tools = [item.model_dump() for item in request.tools]
+                tools = [item.function.model_dump() for item in request.tools]

             tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
             parser = FunctionCallParser(request.tools, tool_call_parser)
@@ -178,25 +178,6 @@ class OpenAIServingChat(OpenAIServingBase):
                     audio_data,
                     modalities,
                 )
-
-                if "tool_calls" in processed_msg and isinstance(
-                    processed_msg.get("tool_calls"), list
-                ):
-                    for call in processed_msg["tool_calls"]:
-                        try:
-                            if "arguments" in call["function"] and isinstance(
-                                call["function"]["arguments"], str
-                            ):
-                                call["function"]["arguments"] = json.loads(
-                                    call["function"]["arguments"]
-                                )
-                        except json.JSONDecodeError as e:
-                            # Log a warning or error if JSON parsing fails for arguments
-                            logger.warning(
-                                f"Failed to parse tool call arguments as JSON: {e}"
-                            )
-                            # Decide whether to continue or raise the exception based on desired behavior
-                            continue  # Or raise e if strict parsing is required
                 openai_compatible_messages.append(processed_msg)

             # Handle assistant prefix for continue_final_message
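The tools fix narrows what gets forwarded to the parser: an OpenAI-style tool entry wraps the function definition in an envelope ({"type": "function", "function": {...}}), and the code now passes only the inner function object. A plain-dict sketch of the difference (illustrative shapes, not the actual pydantic models):

tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}

# Old behaviour: the whole envelope, including the redundant "type" wrapper.
old_style = tool              # item.model_dump()
# New behaviour: only the function definition itself.
new_style = tool["function"]  # item.function.model_dump()
print(new_style["name"])      # get_weather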
sglang/srt/eplb/expert_distribution.py
CHANGED
@@ -47,6 +47,11 @@ class ExpertDistributionRecorder(ABC):
         rank: int,
     ):
         if server_args.expert_distribution_recorder_mode is not None:
+            assert (
+                expert_location_metadata is not None
+            ), "ExpertLocationMetadata is required for expert distribution recording. One possible"
+            "reason is that you are using a model that does not support expert distribution"
+            "recording. Try setting `get_model_config_for_expert_location` in your model."
             return _ExpertDistributionRecorderReal(
                 server_args, expert_location_metadata, rank
             )
sglang/srt/eplb/expert_location.py
CHANGED
@@ -82,6 +82,10 @@ class ExpertLocationMetadata:
     def init_trivial(server_args: ServerArgs, model_config: ModelConfig):
         """Trivial location - logical expert i corresponds to physical expert i"""
         common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
         num_physical_experts = common["num_physical_experts"]
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_layers = model_config_for_expert_location.num_layers
@@ -109,6 +113,10 @@ class ExpertLocationMetadata:
         physical_to_logical_map = physical_to_logical_map.to(server_args.device)

         common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
         model_config_for_expert_location = common["model_config_for_expert_location"]
         logical_to_all_physical_map = _compute_logical_to_all_physical_map(
             physical_to_logical_map,
@@ -133,6 +141,10 @@ class ExpertLocationMetadata:
         logical_count = logical_count.to(server_args.device)

         common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_physical_experts = common["num_physical_experts"]
         num_groups = model_config_for_expert_location.num_groups
@@ -168,6 +180,9 @@ class ExpertLocationMetadata:
             ModelConfigForExpertLocation.from_model_config(model_config)
         )

+        if model_config_for_expert_location is None:
+            return None
+
         num_physical_experts = (
             model_config_for_expert_location.num_logical_experts
             + server_args.ep_num_redundant_experts
@@ -398,10 +413,6 @@ class ModelConfigForExpertLocation:
     num_logical_experts: int
     num_groups: Optional[int] = None

-    @staticmethod
-    def init_dummy():
-        return ModelConfigForExpertLocation(num_layers=1, num_logical_experts=1)
-
     @staticmethod
     def from_model_config(model_config: ModelConfig):
         model_class, _ = get_model_architecture(model_config)
@@ -410,12 +421,12 @@ class ModelConfigForExpertLocation:
                 model_config.hf_config
             )
         else:
-            return
+            return None


 def compute_initial_expert_location_metadata(
     server_args: ServerArgs, model_config: ModelConfig
-) -> ExpertLocationMetadata:
+) -> Optional[ExpertLocationMetadata]:
     data = server_args.init_expert_location
     if data == "trivial":
         return ExpertLocationMetadata.init_trivial(server_args, model_config)
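After this change, models without a get_model_config_for_expert_location hook simply yield no expert-location metadata (None) instead of a dummy placeholder, and callers are expected to handle the absence. A small sketch of that caller-side pattern (the setter name below is hypothetical, not taken from this diff):

metadata = compute_initial_expert_location_metadata(server_args, model_config)
if metadata is None:
    # Non-MoE model, or one without expert-location support: skip the EPLB machinery.
    pass
else:
    set_global_expert_location_metadata(metadata)  # hypothetical setter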
sglang/srt/eplb/expert_location_dispatch.py
CHANGED
@@ -36,6 +36,7 @@ class ExpertLocationDispatchInfo:
     def init_new(cls, layer_id: int):
         ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
         expert_location_metadata = get_global_expert_location_metadata()
+        assert expert_location_metadata is not None

         if ep_dispatch_algorithm is None:
             return None
sglang/srt/eplb/expert_location_updater.py
CHANGED
@@ -50,6 +50,8 @@ class ExpertLocationUpdater:
         torch.cuda.empty_cache()

         old_expert_location_metadata = get_global_expert_location_metadata()
+        assert old_expert_location_metadata is not None
+
         _update_expert_weights(
             routed_experts_weights_of_layer=routed_experts_weights_of_layer,
             old_expert_location_metadata=old_expert_location_metadata,