sglang 0.3.6.post2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sglang/bench_offline_throughput.py +55 -2
  2. sglang/bench_one_batch.py +7 -6
  3. sglang/bench_one_batch_server.py +4 -3
  4. sglang/bench_serving.py +13 -0
  5. sglang/check_env.py +1 -1
  6. sglang/launch_server.py +3 -2
  7. sglang/srt/_custom_ops.py +118 -0
  8. sglang/srt/configs/device_config.py +17 -0
  9. sglang/srt/configs/load_config.py +84 -0
  10. sglang/srt/configs/model_config.py +161 -4
  11. sglang/srt/configs/qwen2vl.py +5 -8
  12. sglang/srt/constrained/outlines_backend.py +6 -1
  13. sglang/srt/constrained/outlines_jump_forward.py +8 -1
  14. sglang/srt/distributed/__init__.py +3 -0
  15. sglang/srt/distributed/communication_op.py +34 -0
  16. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  17. sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  18. sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  19. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  20. sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  21. sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  22. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  23. sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  24. sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  25. sglang/srt/distributed/parallel_state.py +1275 -0
  26. sglang/srt/distributed/utils.py +223 -0
  27. sglang/srt/hf_transformers_utils.py +37 -1
  28. sglang/srt/layers/attention/flashinfer_backend.py +13 -15
  29. sglang/srt/layers/attention/torch_native_backend.py +285 -0
  30. sglang/srt/layers/fused_moe_patch.py +20 -11
  31. sglang/srt/layers/linear.py +1 -0
  32. sglang/srt/layers/logits_processor.py +17 -3
  33. sglang/srt/layers/quantization/__init__.py +34 -0
  34. sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  35. sglang/srt/lora/lora.py +1 -1
  36. sglang/srt/managers/data_parallel_controller.py +7 -11
  37. sglang/srt/managers/detokenizer_manager.py +7 -4
  38. sglang/srt/managers/image_processor.py +1 -1
  39. sglang/srt/managers/io_struct.py +48 -12
  40. sglang/srt/managers/schedule_batch.py +42 -36
  41. sglang/srt/managers/schedule_policy.py +7 -4
  42. sglang/srt/managers/scheduler.py +111 -46
  43. sglang/srt/managers/session_controller.py +0 -3
  44. sglang/srt/managers/tokenizer_manager.py +169 -100
  45. sglang/srt/managers/tp_worker.py +36 -3
  46. sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
  47. sglang/srt/model_executor/cuda_graph_runner.py +16 -7
  48. sglang/srt/model_executor/forward_batch_info.py +9 -4
  49. sglang/srt/model_executor/model_runner.py +136 -150
  50. sglang/srt/model_loader/__init__.py +34 -0
  51. sglang/srt/model_loader/loader.py +1139 -0
  52. sglang/srt/model_loader/utils.py +41 -0
  53. sglang/srt/model_loader/weight_utils.py +640 -0
  54. sglang/srt/models/baichuan.py +9 -10
  55. sglang/srt/models/chatglm.py +6 -15
  56. sglang/srt/models/commandr.py +2 -3
  57. sglang/srt/models/dbrx.py +2 -3
  58. sglang/srt/models/deepseek.py +4 -11
  59. sglang/srt/models/deepseek_v2.py +3 -11
  60. sglang/srt/models/exaone.py +2 -3
  61. sglang/srt/models/gemma.py +2 -6
  62. sglang/srt/models/gemma2.py +3 -14
  63. sglang/srt/models/gemma2_reward.py +0 -1
  64. sglang/srt/models/gpt2.py +5 -12
  65. sglang/srt/models/gpt_bigcode.py +6 -22
  66. sglang/srt/models/grok.py +14 -51
  67. sglang/srt/models/internlm2.py +2 -3
  68. sglang/srt/models/internlm2_reward.py +0 -1
  69. sglang/srt/models/llama.py +97 -27
  70. sglang/srt/models/llama_classification.py +1 -2
  71. sglang/srt/models/llama_embedding.py +1 -2
  72. sglang/srt/models/llama_reward.py +2 -3
  73. sglang/srt/models/llava.py +10 -12
  74. sglang/srt/models/llavavid.py +1 -2
  75. sglang/srt/models/minicpm.py +4 -7
  76. sglang/srt/models/minicpm3.py +6 -19
  77. sglang/srt/models/mixtral.py +12 -5
  78. sglang/srt/models/mixtral_quant.py +2 -3
  79. sglang/srt/models/mllama.py +3 -7
  80. sglang/srt/models/olmo.py +2 -8
  81. sglang/srt/models/olmo2.py +391 -0
  82. sglang/srt/models/olmoe.py +3 -5
  83. sglang/srt/models/phi3_small.py +8 -8
  84. sglang/srt/models/qwen.py +2 -3
  85. sglang/srt/models/qwen2.py +10 -9
  86. sglang/srt/models/qwen2_moe.py +4 -11
  87. sglang/srt/models/qwen2_vl.py +12 -9
  88. sglang/srt/models/registry.py +99 -0
  89. sglang/srt/models/stablelm.py +2 -3
  90. sglang/srt/models/torch_native_llama.py +6 -12
  91. sglang/srt/models/xverse.py +2 -4
  92. sglang/srt/models/xverse_moe.py +4 -11
  93. sglang/srt/models/yivl.py +2 -3
  94. sglang/srt/openai_api/adapter.py +10 -6
  95. sglang/srt/openai_api/protocol.py +1 -0
  96. sglang/srt/server.py +303 -204
  97. sglang/srt/server_args.py +65 -31
  98. sglang/srt/utils.py +253 -48
  99. sglang/test/test_utils.py +27 -7
  100. sglang/utils.py +2 -2
  101. sglang/version.py +1 -1
  102. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/METADATA +2 -1
  103. sglang-0.4.0.dist-info/RECORD +184 -0
  104. sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
  105. sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
  106. sglang/srt/layers/fused_moe_grok/layer.py +0 -630
  107. sglang-0.3.6.post2.dist-info/RECORD +0 -164
  108. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
  109. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
  110. {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py

@@ -15,6 +15,7 @@
 
 import logging
 import os
+import signal
 import threading
 import time
 import warnings
@@ -23,6 +24,7 @@ from concurrent import futures
 from types import SimpleNamespace
 from typing import List, Optional
 
+import psutil
 import torch
 import zmq
 
@@ -36,15 +38,19 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     CloseSessionReqInput,
     FlushCacheReq,
-    GetMemPoolSizeReq,
-    GetMemPoolSizeReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
     OpenSessionReqInput,
     OpenSessionReqOutput,
     ProfileReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
-    UpdateWeightReqInput,
-    UpdateWeightReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
 )
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
@@ -73,7 +79,6 @@ from sglang.srt.utils import (
     crash_on_warnings,
     get_bool_env_var,
     get_zmq_socket,
-    kill_parent_process,
     set_gpu_proc_affinity,
     set_random_seed,
     suppress_other_loggers,
@@ -142,9 +147,12 @@ class Scheduler:
         self.model_config = ModelConfig(
             server_args.model_path,
             trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
         )
         self.is_generation = self.model_config.is_generation
 
@@ -170,6 +178,10 @@ class Scheduler:
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")
 
+        if self.model_config.is_multimodal:
+            self.enable_overlap = False
+            logger.info("Overlap scheduler is disabled for multimodal models.")
+
         if self.enable_overlap:
             self.disable_jump_forward = True
 
@@ -250,6 +262,8 @@ class Scheduler:
 
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
         self.being_chunked_req = None
         self.is_mixed_chunk = (
            self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
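
The new guard turns the CLI sentinel into a proper null: `--chunked-prefill-size -1` now becomes `None`, so downstream checks such as the `is_mixed_chunk` expression above only need a single `is not None` test. A minimal sketch of the same normalization pattern, with illustrative names:

    # Sketch of the sentinel normalization above (names are illustrative).
    from typing import Optional

    def normalize_chunked_prefill_size(size: int) -> Optional[int]:
        # The CLI uses -1 (any non-positive value) to mean "disabled";
        # mapping it to None lets later code use one `is not None` check.
        return None if size <= 0 else size

    assert normalize_chunked_prefill_size(-1) is None
    assert normalize_chunked_prefill_size(8192) == 8192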
@@ -312,6 +326,7 @@ class Scheduler:
         self.watchdog_timeout = server_args.watchdog_timeout
         t = threading.Thread(target=self.watchdog_thread, daemon=True)
         t.start()
+        self.parent_process = psutil.Process().parent()
 
         # Init profiler
         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
@@ -355,7 +370,7 @@ class Scheduler:
                     self.watchdog_last_time = time.time()
             time.sleep(self.watchdog_timeout / 2)
 
-        kill_parent_process()
+        self.parent_process.send_signal(signal.SIGQUIT)
 
     @torch.no_grad()
    def event_loop_normal(self):
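
Both the watchdog and the process entry point (see the last hunks below) replace the removed `kill_parent_process` helper with an explicit `psutil` handle to the parent plus a SIGQUIT. A self-contained sketch of that pattern, assuming a Unix host; the parent's handler body here is illustrative, not sglang's:

    # Child caches its parent once at startup, then signals it on fatal errors.
    import os
    import signal
    import time

    import psutil

    def child_main():
        parent_process = psutil.Process().parent()  # resolved once, as in Scheduler.__init__
        try:
            raise RuntimeError("simulated scheduler crash")
        except Exception:
            # Ask the parent to tear everything down instead of dying silently.
            parent_process.send_signal(signal.SIGQUIT)

    if __name__ == "__main__":
        # The parent installs a SIGQUIT handler before forking workers.
        signal.signal(signal.SIGQUIT, lambda signum, frame: os._exit(1))
        if os.fork() == 0:
            child_main()
        else:
            time.sleep(5)  # a real launcher would join/monitor the child here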
@@ -500,11 +515,27 @@
             self.flush_cache()
         elif isinstance(recv_req, AbortReq):
             self.abort_request(recv_req)
-        elif isinstance(recv_req, UpdateWeightReqInput):
-            success, message = self.update_weights(recv_req)
+        elif isinstance(recv_req, UpdateWeightFromDiskReqInput):
+            success, message = self.update_weights_from_disk(recv_req)
+            self.send_to_tokenizer.send_pyobj(
+                UpdateWeightFromDiskReqOutput(success, message)
+            )
+        elif isinstance(recv_req, GetWeightsByNameReqInput):
+            parameter = self.get_weights_by_name(recv_req)
+            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
+        elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
+            success, message = self.init_weights_update_group(recv_req)
             self.send_to_tokenizer.send_pyobj(
-                UpdateWeightReqOutput(success, message)
+                InitWeightsUpdateGroupReqOutput(success, message)
             )
+        elif isinstance(recv_req, UpdateWeightsFromDistributedReqInput):
+            success, message = self.update_weights_from_distributed(recv_req)
+            self.send_to_tokenizer.send_pyobj(
+                UpdateWeightsFromDistributedReqOutput(success, message)
+            )
+        elif isinstance(recv_req, GetWeightsByNameReqInput):
+            parameter = self.get_weights_by_name(recv_req)
+            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
         elif isinstance(recv_req, ProfileReq):
             if recv_req == ProfileReq.START_PROFILE:
                self.start_profile()
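
Note that the new dispatch chain checks `GetWeightsByNameReqInput` twice; the second branch is unreachable, since an `elif` chain stops at the first match. Functionally, this hunk wires four new control requests from the tokenizer into the scheduler. As a usage sketch, the disk-reload path might be driven from a client like this; the route name is inferred from `UpdateWeightFromDiskReqInput` and should be verified against `sglang/srt/server.py` for the version you run:

    # Hypothetical client call for the disk-based weight reload handled above.
    import requests

    resp = requests.post(
        "http://localhost:30000/update_weights_from_disk",  # assumed route
        json={"model_path": "/models/my-finetuned-checkpoint"},  # illustrative path
    )
    print(resp.json())  # expected shape: {"success": ..., "message": ...}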
@@ -515,10 +546,6 @@
             self.send_to_tokenizer.send_pyobj(OpenSessionReqOutput(session_id))
         elif isinstance(recv_req, CloseSessionReqInput):
             self.close_session(recv_req)
-        elif isinstance(recv_req, GetMemPoolSizeReq):
-            self.send_to_tokenizer.send_pyobj(
-                GetMemPoolSizeReqOutput(self.max_total_num_tokens)
-            )
         else:
             raise ValueError(f"Invalid request: {recv_req}")
 
@@ -526,8 +553,9 @@
         self,
         recv_req: TokenizedGenerateReqInput,
     ):
+        # Create a new request
         if recv_req.session_id is None or recv_req.session_id not in self.sessions:
-            # Create a new request
+
             if recv_req.input_embeds is not None:
                 # Generate fake input_ids based on the length of input_embeds
                 seq_length = len(recv_req.input_embeds)
@@ -558,25 +586,30 @@
             self.waiting_queue.append(req)
             return
 
-        # Image inputs
+        # Handle image inputs
         if recv_req.image_inputs is not None:
-            image_inputs = ImageInputs.from_dict(
-                recv_req.image_inputs, self.model_config.vocab_size
-            )
+            image_inputs = ImageInputs.from_dict(recv_req.image_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
             req.origin_input_ids = self.pad_input_ids_func(
                 req.origin_input_ids, image_inputs
             )
-            req.extend_image_inputs(image_inputs, self.model_config.vocab_size)
+            req.extend_image_inputs(image_inputs)
 
-            if len(req.origin_input_ids) > self.max_req_input_len:
-                req.finished_reason = FINISH_ABORT(
-                    "Image request length is longer than the KV cache pool size or "
-                    "the max context length aborting because you cannot truncate the image embeds"
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                logger.error(
+                    "Multimodal prompt is too long after expanding multimodal tokens. "
+                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}. "
                 )
+                req.origin_input_ids = [0]
+                req.image_inputs = None
                 req.sampling_params.max_new_tokens = 0
+                req.finished_reason = FINISH_ABORT(
+                    "Multimodal prompt is too long. Check server logs for details."
+                )
                 self.waiting_queue.append(req)
                 return
 
+        # Copy more attributes
         req.return_logprob = recv_req.return_logprob
         req.top_logprobs_num = recv_req.top_logprobs_num
        req.stream = recv_req.stream
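
The multimodal branch now fails soft: instead of aborting with a confusing message, it logs the expanded lengths, collapses the prompt to a single dummy token, drops the image inputs, forces zero new tokens, and marks the request aborted so it still flows through the normal output path. The ordering matters because image placeholders are expanded into many dummy tokens before the length check runs. A small sketch of that expand-then-validate flow, with an illustrative pad function standing in for the model-specific `pad_input_ids_func`:

    # Illustrative expand-then-validate flow; pad_input_ids is a stand-in.
    def pad_input_ids(input_ids, num_image_tokens, pad_value):
        # Each image placeholder becomes a run of dummy tokens that the
        # model later overwrites with image embeddings.
        return input_ids + [pad_value] * num_image_tokens

    max_req_input_len = 8
    ids = pad_input_ids([1, 2, 3], num_image_tokens=10, pad_value=151646)
    if len(ids) >= max_req_input_len:
        # Too long only after expansion: neuter the request rather than
        # truncate, since image embeddings cannot be partially cut.
        ids, max_new_tokens = [0], 0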
@@ -647,7 +680,7 @@
 
         self.waiting_queue.append(req)
 
-    def log_prefill_stats(self, adder, can_run_list, running_bs, has_inflight):
+    def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
         if isinstance(self.tree_cache, RadixCache):
             self.tree_cache_metrics["total"] += (
                 adder.log_input_tokens + adder.log_hit_tokens
@@ -671,14 +704,14 @@
             f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
             f"#running-req: {running_bs}, "
-            f"#queue-req: {len(self.waiting_queue) + has_inflight}"
+            f"#queue-req: {len(self.waiting_queue) + has_being_chunked}"
         )
 
         if self.enable_metrics:
             self.stats.num_running_reqs = running_bs
             self.stats.num_used_tokens = num_used
             self.stats.token_usage = round(num_used / self.max_total_num_tokens, 2)
-            self.stats.num_queue_reqs = len(self.waiting_queue) + has_inflight
+            self.stats.num_queue_reqs = len(self.waiting_queue) + has_being_chunked
             self.stats.cache_hit_rate = tree_cache_hit_rate
             self.metrics_collector.log_stats(self.stats)
 
@@ -739,7 +772,7 @@
             # Move the chunked request out of the batch
             self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
             self.tree_cache.cache_unfinished_req(self.being_chunked_req)
-            # Inflight request keeps its rid but will get a new req_pool_idx
+            # being chunked request keeps its rid but will get a new req_pool_idx
             self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)
             self.batch_is_full = False
 
@@ -790,10 +823,10 @@
             running_bs if self.is_mixed_chunk else 0,
         )
 
-        has_inflight = self.being_chunked_req is not None
-        if has_inflight:
+        has_being_chunked = self.being_chunked_req is not None
+        if has_being_chunked:
             self.being_chunked_req.init_next_round_input()
-            self.being_chunked_req = adder.add_inflight_req(self.being_chunked_req)
+            self.being_chunked_req = adder.add_being_chunked_req(self.being_chunked_req)
 
         if self.lora_paths:
             lora_set = (
@@ -835,16 +868,16 @@
                 x for x in self.waiting_queue if x not in set(can_run_list)
             ]
 
-        if adder.new_inflight_req is not None:
+        if adder.new_being_chunked_req is not None:
             assert self.being_chunked_req is None
-            self.being_chunked_req = adder.new_inflight_req
+            self.being_chunked_req = adder.new_being_chunked_req
 
         if self.being_chunked_req:
             self.being_chunked_req.is_being_chunked += 1
 
         # Print stats
         if self.tp_rank == 0:
-            self.log_prefill_stats(adder, can_run_list, running_bs, has_inflight)
+            self.log_prefill_stats(adder, can_run_list, running_bs, has_being_chunked)
 
         # Create a new batch
         new_batch = ScheduleBatch.init_new(
@@ -1017,7 +1050,7 @@
                 if req.grammar is not None:
                     req.grammar.accept_token(next_token_id)
             else:
-                # Inflight reqs' prefill is not finished
+                # being chunked reqs' prefill is not finished
                 req.is_being_chunked -= 1
 
         if batch.next_batch_sampling_info:
@@ -1045,7 +1078,7 @@
             else:
                 self.tree_cache.cache_unfinished_req(req)
         else:
-            # Inflight reqs' prefill is not finished
+            # being chunked reqs' prefill is not finished
             req.is_being_chunked -= 1
 
         self.stream_output(batch.reqs)
@@ -1140,6 +1173,14 @@
                 + 1 : len(req.fill_ids)
                 - req.last_update_decode_tokens
             ]
+
+            # Clip the padded hash values from image tokens.
+            # Otherwise, it will lead to detokenization errors.
+            input_token_ids = [
+                x if x < self.model_config.vocab_size - 1 else 0
+                for x in input_token_ids
+            ]
+
             req.input_token_logprobs = list(zip(input_token_logprobs, input_token_ids))
 
            if (
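
The added clipping exists because multimodal models pad image placeholders with hash-derived ids that lie outside the real vocabulary; zipping those raw ids into `input_token_logprobs` would later crash detokenization. The rule in isolation:

    # Any id at or beyond vocab_size - 1 (the padded image-token range)
    # is replaced with 0 before being paired with its logprob.
    vocab_size = 32000
    input_token_ids = [10, 20, 987654321, 30]
    clipped = [x if x < vocab_size - 1 else 0 for x in input_token_ids]
    assert clipped == [10, 20, 0, 30]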
@@ -1344,18 +1385,20 @@
 
         if to_del is not None:
             del self.waiting_queue[to_del]
+            logger.debug(f"Abort queued request. {req.rid=}")
+            return
 
         # Delete requests in the running batch
         if self.running_batch:
             for req in self.running_batch.reqs:
                 if req.rid == recv_req.rid and not req.finished():
-                    req.finished_reason = FINISH_ABORT()
-                    self.tree_cache.cache_finished_req(req)
+                    logger.debug(f"Abort running request. {req.rid=}")
+                    req.to_abort = True
                     break
 
-    def update_weights(self, recv_req: UpdateWeightReqInput):
-        """In-place update of the weights."""
-        success, message = self.tp_worker.update_weights(recv_req)
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
         if success:
             flash_cache_success = self.flush_cache()
            assert flash_cache_success, "Cache flush failed after updating weights"
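
Aborting a running request is now deferred: the handler only flags `req.to_abort`, and the scheduling loop finalizes the abort between forward passes rather than mutating the tree cache mid-batch. (The handler also returns early once the request is found in the waiting queue, since a queued request cannot also be running.) A minimal sketch of the flag-based handshake; `check_finished` is illustrative, not the exact sglang method:

    # Deferred abort: the RPC handler sets a flag; the scheduler loop
    # converts it into a finished state at a safe point.
    class Req:
        def __init__(self, rid):
            self.rid = rid
            self.to_abort = False
            self.finished_reason = None

        def check_finished(self):
            # Called between forward passes, where cache cleanup is safe.
            if self.to_abort and self.finished_reason is None:
                self.finished_reason = "abort"  # sglang uses FINISH_ABORT(...)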
@@ -1363,6 +1406,27 @@
             logger.error(message)
         return success, message
 
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return success, message
+
+    def update_weights_from_distributed(
+        self, recv_req: UpdateWeightsFromDistributedReqInput
+    ):
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            flash_cache_success = self.flush_cache()
+            assert flash_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return success, message
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return parameter
+
     def start_profile(self) -> None:
         if self.profiler is None:
            raise RuntimeError("Profiler is not enabled.")
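
These three new scheduler methods are thin forwards to the TP worker; the interesting work (joining a process group shared with a trainer and receiving broadcast tensors) happens below them in the model runner. A hedged sketch of what such a path can look like with plain `torch.distributed`; parameter names are illustrative, and this is not the exact sglang implementation:

    # Sketch: join a group shared with the training job, then receive one
    # broadcast tensor per updated parameter. Illustrative, not sglang's code.
    import torch
    import torch.distributed as dist

    def init_weights_update_group(master_address, master_port, rank, world_size):
        dist.init_process_group(
            backend="nccl",  # NCCL assumed for GPU-to-GPU transfers
            init_method=f"tcp://{master_address}:{master_port}",
            rank=rank,
            world_size=world_size,
        )

    def update_weight_from_distributed(model, name, shape, dtype, src=0):
        buf = torch.empty(shape, dtype=dtype, device="cuda")
        dist.broadcast(buf, src=src)  # the trainer broadcasts the new weight
        dict(model.named_parameters())[name].data.copy_(buf)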
@@ -1409,9 +1473,9 @@ def run_scheduler_process(
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
 
-    # [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var
-    if dp_rank is None and "DP_RANK" in os.environ:
-        dp_rank = int(os.environ["DP_RANK"])
+    # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
+    if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
+        dp_rank = int(os.environ["SGLANG_DP_RANK"])
 
     if dp_rank is None:
         configure_logger(server_args, prefix=f" TP{tp_rank}")
@@ -1419,6 +1483,7 @@
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
 
     suppress_other_loggers()
+    parent_process = psutil.Process().parent()
 
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
@@ -1430,6 +1495,6 @@
         else:
             scheduler.event_loop_normal()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
sglang/srt/managers/session_controller.py

@@ -10,10 +10,7 @@
 # limitations under the License.
 # ==============================================================================
 
-import copy
 import uuid
-from dataclasses import dataclass
-from typing import Optional
 
 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, List, Req