sglang 0.4.9.post6__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in one of the supported registries. It is provided for informational purposes only.
Files changed (69)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +3 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +10 -2
  11. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  12. sglang/srt/eplb/expert_distribution.py +5 -0
  13. sglang/srt/eplb/expert_location.py +17 -6
  14. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  15. sglang/srt/eplb/expert_location_updater.py +2 -0
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/step3_detector.py +436 -0
  18. sglang/srt/hf_transformers_utils.py +2 -0
  19. sglang/srt/jinja_template_utils.py +4 -1
  20. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  21. sglang/srt/layers/moe/ep_moe/layer.py +20 -640
  22. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  23. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  24. sglang/srt/layers/quantization/fp8.py +0 -18
  25. sglang/srt/layers/quantization/unquant.py +0 -8
  26. sglang/srt/layers/quantization/w4afp8.py +1 -0
  27. sglang/srt/managers/cache_controller.py +143 -45
  28. sglang/srt/managers/data_parallel_controller.py +2 -0
  29. sglang/srt/managers/io_struct.py +0 -2
  30. sglang/srt/managers/scheduler.py +89 -671
  31. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  32. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  33. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  34. sglang/srt/managers/template_manager.py +62 -19
  35. sglang/srt/managers/tokenizer_manager.py +123 -74
  36. sglang/srt/managers/tp_worker.py +4 -0
  37. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  38. sglang/srt/mem_cache/hicache_storage.py +45 -11
  39. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  40. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  41. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  42. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  43. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  44. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  45. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  46. sglang/srt/model_executor/model_runner.py +5 -0
  47. sglang/srt/models/arcee.py +532 -0
  48. sglang/srt/models/deepseek_v2.py +2 -0
  49. sglang/srt/models/glm4_moe.py +3 -1
  50. sglang/srt/models/granitemoe.py +3 -0
  51. sglang/srt/models/grok.py +3 -0
  52. sglang/srt/models/hunyuan.py +1 -0
  53. sglang/srt/models/llama4.py +3 -0
  54. sglang/srt/models/mixtral.py +3 -0
  55. sglang/srt/models/olmoe.py +3 -0
  56. sglang/srt/models/phimoe.py +1 -0
  57. sglang/srt/models/step3_vl.py +994 -0
  58. sglang/srt/multimodal/processors/base_processor.py +15 -16
  59. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  60. sglang/srt/reasoning_parser.py +2 -1
  61. sglang/srt/server_args.py +10 -13
  62. sglang/srt/speculative/eagle_worker.py +2 -0
  63. sglang/utils.py +0 -11
  64. sglang/version.py +1 -1
  65. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/METADATA +3 -4
  66. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/RECORD +69 -56
  67. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  68. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  69. {sglang-0.4.9.post6.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -138,6 +138,7 @@ class BenchArgs:
 def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+    moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)

     model_config = ModelConfig.from_server_args(server_args)
     model_runner = ModelRunner(
@@ -146,6 +147,8 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
+        moe_ep_rank=moe_ep_rank,
+        moe_ep_size=server_args.ep_size,
         pp_rank=0,
         pp_size=1,
         nccl_port=port_args.nccl_port,
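
The new moe_ep_rank above is derived from the TP rank by integer division. A standalone sketch of that arithmetic, assuming illustrative values tp_size=8 and ep_size=2 (not defaults taken from this diff):

# Worked example of the moe_ep_rank derivation used in load_model above.
tp_size = 8  # illustrative
ep_size = 2  # illustrative
for tp_rank in range(tp_size):
    moe_ep_rank = tp_rank // (tp_size // ep_size)
    print(tp_rank, moe_ep_rank)
# tp_rank 0..3 map to moe_ep_rank 0; tp_rank 4..7 map to moe_ep_rank 1.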

sglang/srt/configs/__init__.py CHANGED
@@ -5,6 +5,11 @@ from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)

 __all__ = [
     "ExaoneConfig",
@@ -14,4 +19,7 @@ __all__ = [
     "MultiModalityConfig",
     "KimiVLConfig",
     "MoonViTConfig",
+    "Step3VLConfig",
+    "Step3TextConfig",
+    "Step3VisionEncoderConfig",
 ]

sglang/srt/configs/model_config.py CHANGED
@@ -335,6 +335,8 @@ class ModelConfig:
             "num_key_value_heads",
             # For ChatGLM:
             "multi_query_group_num",
+            # For Step3
+            "num_attention_groups",
         ]
         for attr in attributes:
             num_kv_heads = getattr(self.hf_text_config, attr, None)
@@ -644,6 +646,7 @@ multimodal_model_archs = [
     "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
+    "Step3VLForConditionalGeneration",
 ]


sglang/srt/configs/step3_vl.py ADDED
@@ -0,0 +1,172 @@
+from typing import Any, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    model_type = "step3_text"
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        moe_layers_enum: tuple[int] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
+        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
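
The new config classes accept either nested config objects or plain dicts. A minimal usage sketch (the values echo the defaults shown above and are not a recommendation):

from sglang.srt.configs.step3_vl import Step3VLConfig

# Dicts are converted to Step3VisionEncoderConfig / Step3TextConfig by the
# constructor shown in the diff above.
config = Step3VLConfig(
    vision_config={"hidden_size": 1792, "image_size": 728},
    text_config={"hidden_size": 7168, "moe_num_experts": 48},
)
print(config.text_config.moe_num_experts)  # 48
print(config.hidden_size)                  # mirrors text_config.hidden_size -> 7168
print(config.image_token_id)               # 128001 (default)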

sglang/srt/conversation.py CHANGED
@@ -994,6 +994,23 @@ register_conv_template(
     )
 )

+register_conv_template(
+    Conversation(
+        name="step3-vl",
+        system_message="<|begin▁of▁sentence|>You are a helpful assistant",
+        system_template="{system_message}\n",
+        roles=(
+            "<|BOT|>user\n",
+            "<|BOT|>assistant\n<think>\n",
+        ),
+        sep="<|EOT|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|EOT|>",
+        image_token="<im_patch>",
+        # add_bos=True,
+    )
+)
+

 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -1103,3 +1120,9 @@ def match_vila(model_path: str):
 def match_mimo_vl(model_path: str):
     if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
         return "mimo-vl"
+
+
+# @register_conv_template_matching_function
+# def match_step3(model_path: str):
+#     if re.search(r"step3", model_path, re.IGNORECASE):
+#         return "step3-vl"

sglang/srt/disaggregation/decode.py CHANGED
@@ -694,10 +694,7 @@ class SchedulerDisaggregationDecodeMixin:
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
-                # When the server is idle, do self-check and re-init some states
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch

@@ -771,10 +768,7 @@ class SchedulerDisaggregationDecodeMixin:
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
-                # When the server is idle, do self-check and re-init some states
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch
             self.last_batch_in_queue = last_batch_in_queue

sglang/srt/disaggregation/prefill.py CHANGED
@@ -287,9 +287,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.process_disagg_prefill_inflight_queue()

             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -337,9 +335,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.process_disagg_prefill_inflight_queue()

             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()

             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
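
In both mixins the inlined idle-path bookkeeping is replaced by a single call to self_check_during_idle(). Based only on the removed lines above, a sketch of what such a helper consolidates (the actual method lives elsewhere in the refactored scheduler and may do more):

def self_check_during_idle(self):
    # When the server is idle, do self-check and re-init some states.
    self.check_memory()
    self.new_token_ratio = self.init_new_token_ratio
    self.maybe_sleep_on_idle()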

sglang/srt/distributed/parallel_state.py CHANGED
@@ -354,6 +354,13 @@ class GroupCoordinator:
             self.cpu_group, 1 << 22, 6
         )

+    def __repr__(self):
+        return (
+            f"ranks={self.ranks} rank={self.rank} local_rank={self.local_rank} use_pynccl={self.use_pynccl} "
+            f"device_group={self.device_group} cpu_group={self.cpu_group} unique_name={self.unique_name} "
+            f"world_size={self.world_size} rank_in_group={self.rank_in_group}"
+        )
+
     @property
     def first_rank(self):
         """Return the global rank of the first process in the group"""
@@ -1141,6 +1148,20 @@ def get_tp_group() -> GroupCoordinator:
     return _TP


+_MOE_EP: Optional[GroupCoordinator] = None
+_MOE_TP: Optional[GroupCoordinator] = None
+
+
+def get_moe_ep_group() -> GroupCoordinator:
+    assert _MOE_EP is not None, "expert model parallel group is not initialized"
+    return _MOE_EP
+
+
+def get_moe_tp_group() -> GroupCoordinator:
+    assert _MOE_TP is not None, "expert model parallel group is not initialized"
+    return _MOE_TP
+
+
 # kept for backward compatibility
 get_tensor_model_parallel_group = get_tp_group

@@ -1250,6 +1271,7 @@ def init_distributed_environment(

 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
+    expert_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     backend: Optional[str] = None,
     duplicate_tp_group: bool = False,
@@ -1327,6 +1349,45 @@ def initialize_model_parallel(
         _TP.pynccl_comm.disabled = False
         _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False

+    moe_ep_size = expert_model_parallel_size
+
+    moe_tp_size = tensor_model_parallel_size // moe_ep_size
+    global _MOE_EP
+    assert _MOE_EP is None, "expert model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        for j in range(moe_tp_size):
+            st = i * tensor_model_parallel_size + j
+            en = (i + 1) * tensor_model_parallel_size + j
+            ranks = list(range(st, en, moe_tp_size))
+            group_ranks.append(ranks)
+
+    _MOE_EP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="moe_ep",
+    )
+
+    global _MOE_TP
+    assert _MOE_TP is None, "expert model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        for j in range(moe_ep_size):
+            st = i * tensor_model_parallel_size + j * moe_tp_size
+            en = i * tensor_model_parallel_size + (j + 1) * moe_tp_size
+            ranks = list(range(st, en))
+            group_ranks.append(ranks)
+
+    _MOE_TP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="moe_tp",
+    )
+
     # Build the pipeline model-parallel groups.
     num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
     global _PP
@@ -1347,6 +1408,7 @@ def initialize_model_parallel(

 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
+    expert_model_parallel_size: int,
     pipeline_model_parallel_size: int,
     backend: Optional[str] = None,
 ) -> None:
@@ -1357,7 +1419,10 @@ def ensure_model_parallel_initialized(
     backend = backend or torch.distributed.get_backend(get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(
-            tensor_model_parallel_size, pipeline_model_parallel_size, backend
+            tensor_model_parallel_size,
+            expert_model_parallel_size,
+            pipeline_model_parallel_size,
+            backend,
         )
         return

@@ -1417,6 +1482,26 @@ def get_tensor_model_parallel_rank():
     return get_tp_group().rank_in_group


+def get_moe_expert_parallel_world_size():
+    """Return world size for the moe expert parallel group."""
+    return get_moe_ep_group().world_size
+
+
+def get_moe_expert_parallel_rank():
+    """Return my rank for the moe expert parallel group."""
+    return get_moe_ep_group().rank_in_group
+
+
+def get_moe_tensor_parallel_world_size():
+    """Return world size for the moe tensor parallel group."""
+    return get_moe_tp_group().world_size
+
+
+def get_moe_tensor_parallel_rank():
+    """Return my rank for the moe tensor parallel group."""
+    return get_moe_tp_group().rank_in_group
+
+
 def destroy_model_parallel():
     """Set the groups to none and destroy them."""
     global _TP
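
The nested loops in initialize_model_parallel above carve each TP group into moe_ep and moe_tp sub-groups. A standalone sketch of the same partitioning, assuming illustrative sizes world_size=8, tensor_model_parallel_size=8, expert_model_parallel_size=2 (no torch.distributed required):

# Reproduces the group-rank construction from the diff above, outside of any
# distributed runtime, to show which ranks land in which group.
world_size = 8                    # illustrative
tensor_model_parallel_size = 8    # illustrative
expert_model_parallel_size = 2    # illustrative

num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
moe_ep_size = expert_model_parallel_size
moe_tp_size = tensor_model_parallel_size // moe_ep_size

moe_ep_groups = []
for i in range(num_tensor_model_parallel_groups):
    for j in range(moe_tp_size):
        st = i * tensor_model_parallel_size + j
        en = (i + 1) * tensor_model_parallel_size + j
        moe_ep_groups.append(list(range(st, en, moe_tp_size)))

moe_tp_groups = []
for i in range(num_tensor_model_parallel_groups):
    for j in range(moe_ep_size):
        st = i * tensor_model_parallel_size + j * moe_tp_size
        en = i * tensor_model_parallel_size + (j + 1) * moe_tp_size
        moe_tp_groups.append(list(range(st, en)))

print(moe_ep_groups)  # [[0, 4], [1, 5], [2, 6], [3, 7]]
print(moe_tp_groups)  # [[0, 1, 2, 3], [4, 5, 6, 7]]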

sglang/srt/entrypoints/engine.py CHANGED
@@ -648,29 +648,23 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.2.7",
+            "0.2.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

-    def sigchld_handler(signum, frame):
-        pid, exitcode = os.waitpid(0, os.WNOHANG)
-        if exitcode != 0:
-            logger.warning(
-                f"Child process unexpectedly failed with {exitcode=}. {pid=}"
+    if True:  # Keep this check for internal code compatibility
+        # Register the signal handler.
+        # The child processes will send SIGQUIT to this process when any error happens
+        # This process then clean up the whole process tree
+        # Note: This sigquit handler is used in the launch phase, and may be replaced by
+        # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+        def launch_phase_sigquit_handler(signum, frame):
+            logger.error(
+                "Received sigquit from a child process. It usually means the child failed."
             )
+            kill_process_tree(os.getpid())

-    signal.signal(signal.SIGCHLD, sigchld_handler)
-
-    # Register the signal handler.
-    # The child processes will send SIGQUIT to this process when any error happens
-    # This process then clean up the whole process tree
-    def sigquit_handler(signum, frame):
-        logger.error(
-            "Received sigquit from a child process. It usually means the child failed."
-        )
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
+        signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)

     # Set mp start method
     mp.set_start_method("spawn", force=True)
@@ -725,6 +719,7 @@ def _launch_subprocesses(
                 + ((pp_rank % pp_size_per_node) * tp_size_per_node)
                 + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
             )
+            moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(
@@ -732,6 +727,7 @@
                     port_args,
                     gpu_id,
                     tp_rank,
+                    moe_ep_rank,
                     pp_rank,
                     None,
                     writer,

sglang/srt/entrypoints/http_server.py CHANGED
@@ -238,6 +238,9 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)

     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break

-    tic = time.perf_counter()
+    # This request is a special request.
+    # If the server already has something running, this request will be ignored, so it creates zero overhead.
+    # If the server is not running, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
-    while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
         await asyncio.sleep(1)
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
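
With this change, /health_generate reports readiness from the caller's point of view: any response received from the scheduler within HEALTH_CHECK_TIMEOUT counts as healthy, and a server that is shutting down answers 503 immediately. A minimal client-side check, assuming the default local endpoint http://localhost:30000 (adjust for your deployment):

import requests  # assumes the requests package is installed

resp = requests.get("http://localhost:30000/health_generate", timeout=60)
# 200 -> the server produced (or was already producing) output in time.
# 503 -> the server is draining/shutting down, per the gracefully_exit check above.
print(resp.status_code)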

sglang/srt/entrypoints/openai/serving_chat.py CHANGED
@@ -127,12 +127,12 @@ class OpenAIServingChat(OpenAIServingBase):
             request.skip_special_tokens = False
             if not isinstance(request.tool_choice, str):
                 tools = [
-                    item.model_dump()
+                    item.function.model_dump()
                     for item in request.tools
                     if item.function.name == request.tool_choice.function.name
                 ]
             else:
-                tools = [item.model_dump() for item in request.tools]
+                tools = [item.function.model_dump() for item in request.tools]

         tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
         parser = FunctionCallParser(request.tools, tool_call_parser)
@@ -178,25 +178,6 @@ class OpenAIServingChat(OpenAIServingBase):
                     audio_data,
                     modalities,
                 )
-
-                if "tool_calls" in processed_msg and isinstance(
-                    processed_msg.get("tool_calls"), list
-                ):
-                    for call in processed_msg["tool_calls"]:
-                        try:
-                            if "arguments" in call["function"] and isinstance(
-                                call["function"]["arguments"], str
-                            ):
-                                call["function"]["arguments"] = json.loads(
-                                    call["function"]["arguments"]
-                                )
-                        except json.JSONDecodeError as e:
-                            # Log a warning or error if JSON parsing fails for arguments
-                            logger.warning(
-                                f"Failed to parse tool call arguments as JSON: {e}"
-                            )
-                            # Decide whether to continue or raise the exception based on desired behavior
-                            continue  # Or raise e if strict parsing is required
                 openai_compatible_messages.append(processed_msg)

             # Handle assistant prefix for continue_final_message
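
The tools list now carries item.function.model_dump() instead of item.model_dump(), i.e. only the nested function payload rather than the whole tool wrapper. A minimal sketch of the shape difference, using simplified pydantic stand-ins rather than sglang's actual protocol classes:

from pydantic import BaseModel


class Function(BaseModel):  # simplified stand-in
    name: str
    parameters: dict = {}


class Tool(BaseModel):  # simplified stand-in
    type: str = "function"
    function: Function


tool = Tool(function=Function(name="get_weather", parameters={"type": "object"}))

print(tool.model_dump())
# {'type': 'function', 'function': {'name': 'get_weather', 'parameters': {'type': 'object'}}}

print(tool.function.model_dump())
# {'name': 'get_weather', 'parameters': {'type': 'object'}}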

sglang/srt/eplb/expert_distribution.py CHANGED
@@ -47,6 +47,11 @@ class ExpertDistributionRecorder(ABC):
         rank: int,
     ):
         if server_args.expert_distribution_recorder_mode is not None:
+            assert (
+                expert_location_metadata is not None
+            ), "ExpertLocationMetadata is required for expert distribution recording. One possible"
+            "reason is that you are using a model that does not support expert distribution"
+            "recording. Try setting `get_model_config_for_expert_location` in your model."
             return _ExpertDistributionRecorderReal(
                 server_args, expert_location_metadata, rank
             )

sglang/srt/eplb/expert_location.py CHANGED
@@ -82,6 +82,10 @@ class ExpertLocationMetadata:
     def init_trivial(server_args: ServerArgs, model_config: ModelConfig):
         """Trivial location - logical expert i corresponds to physical expert i"""
         common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
         num_physical_experts = common["num_physical_experts"]
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_layers = model_config_for_expert_location.num_layers
@@ -109,6 +113,10 @@ class ExpertLocationMetadata:
         physical_to_logical_map = physical_to_logical_map.to(server_args.device)

         common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
         model_config_for_expert_location = common["model_config_for_expert_location"]
         logical_to_all_physical_map = _compute_logical_to_all_physical_map(
             physical_to_logical_map,
@@ -133,6 +141,10 @@ class ExpertLocationMetadata:
         logical_count = logical_count.to(server_args.device)

         common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_physical_experts = common["num_physical_experts"]
         num_groups = model_config_for_expert_location.num_groups
@@ -168,6 +180,9 @@ class ExpertLocationMetadata:
             ModelConfigForExpertLocation.from_model_config(model_config)
         )

+        if model_config_for_expert_location is None:
+            return None
+
         num_physical_experts = (
             model_config_for_expert_location.num_logical_experts
             + server_args.ep_num_redundant_experts
@@ -398,10 +413,6 @@ class ModelConfigForExpertLocation:
     num_logical_experts: int
     num_groups: Optional[int] = None

-    @staticmethod
-    def init_dummy():
-        return ModelConfigForExpertLocation(num_layers=1, num_logical_experts=1)
-
     @staticmethod
     def from_model_config(model_config: ModelConfig):
         model_class, _ = get_model_architecture(model_config)
@@ -410,12 +421,12 @@ class ModelConfigForExpertLocation:
                 model_config.hf_config
             )
         else:
-            return ModelConfigForExpertLocation.init_dummy()
+            return None


 def compute_initial_expert_location_metadata(
     server_args: ServerArgs, model_config: ModelConfig
-) -> ExpertLocationMetadata:
+) -> Optional[ExpertLocationMetadata]:
     data = server_args.init_expert_location
     if data == "trivial":
         return ExpertLocationMetadata.init_trivial(server_args, model_config)

sglang/srt/eplb/expert_location_dispatch.py CHANGED
@@ -36,6 +36,7 @@ class ExpertLocationDispatchInfo:
     def init_new(cls, layer_id: int):
         ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
         expert_location_metadata = get_global_expert_location_metadata()
+        assert expert_location_metadata is not None

         if ep_dispatch_algorithm is None:
             return None

sglang/srt/eplb/expert_location_updater.py CHANGED
@@ -50,6 +50,8 @@ class ExpertLocationUpdater:
         torch.cuda.empty_cache()

         old_expert_location_metadata = get_global_expert_location_metadata()
+        assert old_expert_location_metadata is not None
+
         _update_expert_weights(
             routed_experts_weights_of_layer=routed_experts_weights_of_layer,
             old_expert_location_metadata=old_expert_location_metadata,