sglang 0.3.6.post2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +55 -2
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +4 -3
- sglang/bench_serving.py +13 -0
- sglang/check_env.py +1 -1
- sglang/launch_server.py +3 -2
- sglang/srt/_custom_ops.py +118 -0
- sglang/srt/configs/device_config.py +17 -0
- sglang/srt/configs/load_config.py +84 -0
- sglang/srt/configs/model_config.py +161 -4
- sglang/srt/configs/qwen2vl.py +5 -8
- sglang/srt/constrained/outlines_backend.py +6 -1
- sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang/srt/distributed/__init__.py +3 -0
- sglang/srt/distributed/communication_op.py +34 -0
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang/srt/distributed/parallel_state.py +1275 -0
- sglang/srt/distributed/utils.py +223 -0
- sglang/srt/hf_transformers_utils.py +37 -1
- sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang/srt/layers/attention/torch_native_backend.py +285 -0
- sglang/srt/layers/fused_moe_patch.py +20 -11
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +17 -3
- sglang/srt/layers/quantization/__init__.py +34 -0
- sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -11
- sglang/srt/managers/detokenizer_manager.py +7 -4
- sglang/srt/managers/image_processor.py +1 -1
- sglang/srt/managers/io_struct.py +48 -12
- sglang/srt/managers/schedule_batch.py +42 -36
- sglang/srt/managers/schedule_policy.py +7 -4
- sglang/srt/managers/scheduler.py +111 -46
- sglang/srt/managers/session_controller.py +0 -3
- sglang/srt/managers/tokenizer_manager.py +169 -100
- sglang/srt/managers/tp_worker.py +36 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
- sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- sglang/srt/model_executor/forward_batch_info.py +9 -4
- sglang/srt/model_executor/model_runner.py +136 -150
- sglang/srt/model_loader/__init__.py +34 -0
- sglang/srt/model_loader/loader.py +1139 -0
- sglang/srt/model_loader/utils.py +41 -0
- sglang/srt/model_loader/weight_utils.py +640 -0
- sglang/srt/models/baichuan.py +9 -10
- sglang/srt/models/chatglm.py +6 -15
- sglang/srt/models/commandr.py +2 -3
- sglang/srt/models/dbrx.py +2 -3
- sglang/srt/models/deepseek.py +4 -11
- sglang/srt/models/deepseek_v2.py +3 -11
- sglang/srt/models/exaone.py +2 -3
- sglang/srt/models/gemma.py +2 -6
- sglang/srt/models/gemma2.py +3 -14
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/gpt2.py +5 -12
- sglang/srt/models/gpt_bigcode.py +6 -22
- sglang/srt/models/grok.py +14 -51
- sglang/srt/models/internlm2.py +2 -3
- sglang/srt/models/internlm2_reward.py +0 -1
- sglang/srt/models/llama.py +97 -27
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_embedding.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +10 -12
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +4 -7
- sglang/srt/models/minicpm3.py +6 -19
- sglang/srt/models/mixtral.py +12 -5
- sglang/srt/models/mixtral_quant.py +2 -3
- sglang/srt/models/mllama.py +3 -7
- sglang/srt/models/olmo.py +2 -8
- sglang/srt/models/olmo2.py +391 -0
- sglang/srt/models/olmoe.py +3 -5
- sglang/srt/models/phi3_small.py +8 -8
- sglang/srt/models/qwen.py +2 -3
- sglang/srt/models/qwen2.py +10 -9
- sglang/srt/models/qwen2_moe.py +4 -11
- sglang/srt/models/qwen2_vl.py +12 -9
- sglang/srt/models/registry.py +99 -0
- sglang/srt/models/stablelm.py +2 -3
- sglang/srt/models/torch_native_llama.py +6 -12
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +4 -11
- sglang/srt/models/yivl.py +2 -3
- sglang/srt/openai_api/adapter.py +10 -6
- sglang/srt/openai_api/protocol.py +1 -0
- sglang/srt/server.py +303 -204
- sglang/srt/server_args.py +65 -31
- sglang/srt/utils.py +253 -48
- sglang/test/test_utils.py +27 -7
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/METADATA +2 -1
- sglang-0.4.0.dist-info/RECORD +184 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- sglang-0.3.6.post2.dist-info/RECORD +0 -164
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
```diff
@@ -15,6 +15,7 @@
 
 import logging
 import os
+import signal
 import threading
 import time
 import warnings
```
```diff
@@ -23,6 +24,7 @@ from concurrent import futures
 from types import SimpleNamespace
 from typing import List, Optional
 
+import psutil
 import torch
 import zmq
 
```
```diff
@@ -36,15 +38,19 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     CloseSessionReqInput,
     FlushCacheReq,
-    GetMemPoolSizeReq,
-    GetMemPoolSizeReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
     OpenSessionReqInput,
     OpenSessionReqOutput,
     ProfileReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
-    UpdateWeightReqInput,
-    UpdateWeightReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
 )
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
```
```diff
@@ -73,7 +79,6 @@ from sglang.srt.utils import (
     crash_on_warnings,
     get_bool_env_var,
     get_zmq_socket,
-    kill_parent_process,
     set_gpu_proc_affinity,
     set_random_seed,
     suppress_other_loggers,
```
```diff
@@ -142,9 +147,12 @@ class Scheduler:
         self.model_config = ModelConfig(
             server_args.model_path,
             trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
             context_length=server_args.context_length,
             model_override_args=server_args.json_model_override_args,
             is_embedding=server_args.is_embedding,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
         )
         self.is_generation = self.model_config.is_generation
 
```
```diff
@@ -170,6 +178,10 @@ class Scheduler:
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")
 
+        if self.model_config.is_multimodal:
+            self.enable_overlap = False
+            logger.info("Overlap scheduler is disabled for multimodal models.")
+
         if self.enable_overlap:
             self.disable_jump_forward = True
 
```
```diff
@@ -250,6 +262,8 @@ class Scheduler:
 
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
         self.being_chunked_req = None
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
```
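The guard added here keeps one internal convention: any non-positive CLI value (the flag uses -1 for "disabled") becomes `None`, so downstream code only ever tests `self.chunked_prefill_size is not None`. A minimal standalone sketch of the same normalization (function name hypothetical, not part of sglang):

```python
from typing import Optional

def normalize_chunked_prefill_size(raw: int) -> Optional[int]:
    """Map the CLI convention (-1 or 0 means disabled) to an Optional,
    so callers only ever need the single check `size is not None`."""
    return None if raw <= 0 else raw

assert normalize_chunked_prefill_size(-1) is None
assert normalize_chunked_prefill_size(8192) == 8192
```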
```diff
@@ -312,6 +326,7 @@ class Scheduler:
         self.watchdog_timeout = server_args.watchdog_timeout
         t = threading.Thread(target=self.watchdog_thread, daemon=True)
         t.start()
+        self.parent_process = psutil.Process().parent()
 
         # Init profiler
         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
```
```diff
@@ -355,7 +370,7 @@ class Scheduler:
             self.watchdog_last_time = time.time()
             time.sleep(self.watchdog_timeout / 2)
 
-        kill_parent_process()
+        self.parent_process.send_signal(signal.SIGQUIT)
 
     @torch.no_grad()
     def event_loop_normal(self):
```
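The watchdog (and, further down, the crash handler in `run_scheduler_process`) now signals the parent directly instead of calling the removed `kill_parent_process` helper: the `psutil` handle is resolved once at startup, and `SIGQUIT` is raised when the timeout trips. A sketch of that pattern under the same assumptions (a POSIX parent that installs a `SIGQUIT` handler; `psutil` installed):

```python
import signal
import time

import psutil  # third-party: pip install psutil

def watchdog(timeout_s: float, last_beat: list) -> None:
    """Signal the parent process if the main loop stops heartbeating."""
    parent = psutil.Process().parent()  # resolved once, like self.parent_process
    while True:
        time.sleep(timeout_s / 2)
        if time.time() - last_beat[0] > timeout_s:
            # SIGQUIT asks the parent to tear down the whole process tree;
            # a daemon thread cannot do that cleanly on its own.
            parent.send_signal(signal.SIGQUIT)
            return
```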
```diff
@@ -500,11 +515,27 @@ class Scheduler:
             self.flush_cache()
         elif isinstance(recv_req, AbortReq):
             self.abort_request(recv_req)
-        elif isinstance(recv_req, UpdateWeightReqInput):
-            success, message = self.update_weights(recv_req)
+        elif isinstance(recv_req, UpdateWeightFromDiskReqInput):
+            success, message = self.update_weights_from_disk(recv_req)
+            self.send_to_tokenizer.send_pyobj(
+                UpdateWeightFromDiskReqOutput(success, message)
+            )
+        elif isinstance(recv_req, GetWeightsByNameReqInput):
+            parameter = self.get_weights_by_name(recv_req)
+            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
+        elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
+            success, message = self.init_weights_update_group(recv_req)
             self.send_to_tokenizer.send_pyobj(
-                UpdateWeightReqOutput(success, message)
+                InitWeightsUpdateGroupReqOutput(success, message)
             )
+        elif isinstance(recv_req, UpdateWeightsFromDistributedReqInput):
+            success, message = self.update_weights_from_distributed(recv_req)
+            self.send_to_tokenizer.send_pyobj(
+                UpdateWeightsFromDistributedReqOutput(success, message)
+            )
+        elif isinstance(recv_req, GetWeightsByNameReqInput):
+            parameter = self.get_weights_by_name(recv_req)
+            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
         elif isinstance(recv_req, ProfileReq):
             if recv_req == ProfileReq.START_PROFILE:
                 self.start_profile()
```
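Each control message is a pickled object received over ZMQ, matched by concrete type, and answered with a typed output object on the tokenizer socket; the diff swaps the old single `UpdateWeightReqInput` branch for several request/output pairs. A stripped-down sketch of the same dispatch shape (the dataclasses and worker method here are hypothetical, not the real io_struct types):

```python
from dataclasses import dataclass

@dataclass
class UpdateWeightsReq:
    model_path: str

@dataclass
class UpdateWeightsOut:
    success: bool
    message: str

def process_control_request(recv_req, send_to_tokenizer, worker):
    # One isinstance branch per request type; every branch replies with the
    # matching *Out object so the tokenizer side can await a typed response.
    if isinstance(recv_req, UpdateWeightsReq):
        success, message = worker.update_weights(recv_req)
        send_to_tokenizer.send_pyobj(UpdateWeightsOut(success, message))
    else:
        raise ValueError(f"Invalid request: {recv_req}")
```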
```diff
@@ -515,10 +546,6 @@ class Scheduler:
             self.send_to_tokenizer.send_pyobj(OpenSessionReqOutput(session_id))
         elif isinstance(recv_req, CloseSessionReqInput):
             self.close_session(recv_req)
-        elif isinstance(recv_req, GetMemPoolSizeReq):
-            self.send_to_tokenizer.send_pyobj(
-                GetMemPoolSizeReqOutput(self.max_total_num_tokens)
-            )
         else:
             raise ValueError(f"Invalid request: {recv_req}")
 
```
```diff
@@ -526,8 +553,9 @@ class Scheduler:
         self,
         recv_req: TokenizedGenerateReqInput,
     ):
+        # Create a new request
         if recv_req.session_id is None or recv_req.session_id not in self.sessions:
-
+
             if recv_req.input_embeds is not None:
                 # Generate fake input_ids based on the length of input_embeds
                 seq_length = len(recv_req.input_embeds)
```
```diff
@@ -558,25 +586,30 @@ class Scheduler:
             self.waiting_queue.append(req)
             return
 
-        #
+        # Handle image inputs
         if recv_req.image_inputs is not None:
-            image_inputs = ImageInputs.from_dict(
-                recv_req.image_inputs, self.model_config.vocab_size
-            )
+            image_inputs = ImageInputs.from_dict(recv_req.image_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
             req.origin_input_ids = self.pad_input_ids_func(
                 req.origin_input_ids, image_inputs
             )
-            req.extend_image_inputs(image_inputs)
+            req.extend_image_inputs(image_inputs)
 
-            if len(req.origin_input_ids) > self.max_req_input_len:
-                req.finished_reason = FINISH_ABORT(
-                    "
-                    "
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                logger.error(
+                    "Multimodal prompt is too long after expanding multimodal tokens. "
+                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}. "
                 )
+                req.origin_input_ids = [0]
+                req.image_inputs = None
                 req.sampling_params.max_new_tokens = 0
+                req.finished_reason = FINISH_ABORT(
+                    "Multimodal prompt is too long. Check server logs for details."
+                )
                 self.waiting_queue.append(req)
                 return
 
+        # Copy more attributes
         req.return_logprob = recv_req.return_logprob
         req.top_logprobs_num = recv_req.top_logprobs_num
         req.stream = recv_req.stream
```
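Expanding one image placeholder into many dummy embedding slots can push a prompt past `max_req_input_len`; the new path logs the exact before/after sizes, zeroes the request, and aborts it with a user-visible reason instead of failing later in the pipeline. A self-contained sketch of the expand-then-guard flow (the placeholder id and expansion factor are made up for illustration):

```python
IMAGE_TOKEN = -100       # hypothetical placeholder id in the raw prompt
EMBEDS_PER_IMAGE = 576   # hypothetical number of dummy slots per image

def pad_input_ids(input_ids: list) -> list:
    """Expand each image placeholder into EMBEDS_PER_IMAGE dummy ids (0)."""
    out = []
    for tok in input_ids:
        out.extend([0] * EMBEDS_PER_IMAGE if tok == IMAGE_TOKEN else [tok])
    return out

def admit_multimodal(input_ids: list, max_req_input_len: int):
    """Return (padded_ids, None) on success or (None, error) on overflow."""
    padded = pad_input_ids(input_ids)
    if len(padded) >= max_req_input_len:
        # Image embeds cannot be truncated, so abort rather than clip.
        return None, "Multimodal prompt is too long after expanding multimodal tokens."
    return padded, None
```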
```diff
@@ -647,7 +680,7 @@ class Scheduler:
 
         self.waiting_queue.append(req)
 
-    def log_prefill_stats(self, adder, can_run_list, running_bs,
+    def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
         if isinstance(self.tree_cache, RadixCache):
             self.tree_cache_metrics["total"] += (
                 adder.log_input_tokens + adder.log_hit_tokens
```
```diff
@@ -671,14 +704,14 @@ class Scheduler:
             f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
             f"#running-req: {running_bs}, "
-            f"#queue-req: {len(self.waiting_queue) +
+            f"#queue-req: {len(self.waiting_queue) + has_being_chunked}"
         )
 
         if self.enable_metrics:
             self.stats.num_running_reqs = running_bs
             self.stats.num_used_tokens = num_used
             self.stats.token_usage = round(num_used / self.max_total_num_tokens, 2)
-            self.stats.num_queue_reqs = len(self.waiting_queue) +
+            self.stats.num_queue_reqs = len(self.waiting_queue) + has_being_chunked
             self.stats.cache_hit_rate = tree_cache_hit_rate
             self.metrics_collector.log_stats(self.stats)
 
```
```diff
@@ -739,7 +772,7 @@ class Scheduler:
             # Move the chunked request out of the batch
             self.last_batch.filter_batch(being_chunked_req=self.being_chunked_req)
             self.tree_cache.cache_unfinished_req(self.being_chunked_req)
-            #
+            # being chunked request keeps its rid but will get a new req_pool_idx
             self.req_to_token_pool.free(self.being_chunked_req.req_pool_idx)
             self.batch_is_full = False
 
```
```diff
@@ -790,10 +823,10 @@ class Scheduler:
             running_bs if self.is_mixed_chunk else 0,
         )
 
-
-        if
+        has_being_chunked = self.being_chunked_req is not None
+        if has_being_chunked:
             self.being_chunked_req.init_next_round_input()
-            self.being_chunked_req = adder.
+            self.being_chunked_req = adder.add_being_chunked_req(self.being_chunked_req)
 
         if self.lora_paths:
             lora_set = (
```
```diff
@@ -835,16 +868,16 @@ class Scheduler:
                 x for x in self.waiting_queue if x not in set(can_run_list)
             ]
 
-        if adder.
+        if adder.new_being_chunked_req is not None:
             assert self.being_chunked_req is None
-            self.being_chunked_req = adder.
+            self.being_chunked_req = adder.new_being_chunked_req
 
         if self.being_chunked_req:
             self.being_chunked_req.is_being_chunked += 1
 
         # Print stats
         if self.tp_rank == 0:
-            self.log_prefill_stats(adder, can_run_list, running_bs,
+            self.log_prefill_stats(adder, can_run_list, running_bs, has_being_chunked)
 
         # Create a new batch
         new_batch = ScheduleBatch.init_new(
```
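A prompt that exceeds the per-batch chunked-prefill budget is threaded through several batches: the adder hands back either the still-unfinished request or a `new_being_chunked_req`, and the `is_being_chunked` counter tracks outstanding partial prefills so the result path (two hunks below) knows those requests produce no output yet. A toy model of that counter protocol, heavily simplified relative to the real `schedule_policy` logic:

```python
class Req:
    def __init__(self, rid: str, prompt_len: int):
        self.rid = rid
        self.remaining_prefill = prompt_len
        self.is_being_chunked = 0

def schedule_prefill_chunk(req: Req, budget: int) -> bool:
    """Consume up to `budget` prompt tokens; True means prefill finished."""
    req.remaining_prefill -= min(budget, req.remaining_prefill)
    if req.remaining_prefill > 0:
        req.is_being_chunked += 1  # carried into the next batch
        return False
    return True

def handle_batch_result(req: Req) -> bool:
    """True if the request may stream output this round."""
    if req.is_being_chunked > 0:
        req.is_being_chunked -= 1  # a partial prefill round just completed
        return False
    return True
```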
```diff
@@ -1017,7 +1050,7 @@ class Scheduler:
                 if req.grammar is not None:
                     req.grammar.accept_token(next_token_id)
             else:
-                #
+                # being chunked reqs' prefill is not finished
                 req.is_being_chunked -= 1
 
         if batch.next_batch_sampling_info:
```
```diff
@@ -1045,7 +1078,7 @@ class Scheduler:
                 else:
                     self.tree_cache.cache_unfinished_req(req)
             else:
-                #
+                # being chunked reqs' prefill is not finished
                 req.is_being_chunked -= 1
 
         self.stream_output(batch.reqs)
```
```diff
@@ -1140,6 +1173,14 @@ class Scheduler:
                 + 1 : len(req.fill_ids)
                 - req.last_update_decode_tokens
             ]
+
+            # Clip the padded hash values from image tokens.
+            # Otherwise, it will lead to detokenization errors.
+            input_token_ids = [
+                x if x < self.model_config.vocab_size - 1 else 0
+                for x in input_token_ids
+            ]
+
             req.input_token_logprobs = list(zip(input_token_logprobs, input_token_ids))
 
         if (
```
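The dummy ids produced by image-token padding are hash values that can exceed the tokenizer's vocabulary, so the logprob bookkeeping clips them before they reach detokenization. The operation itself, isolated (same comprehension as in the diff, wrapped in a hypothetical helper):

```python
def clip_padded_image_ids(input_token_ids: list, vocab_size: int) -> list:
    """Replace out-of-vocab padded ids with 0 so the detokenizer never
    receives a token id the tokenizer cannot decode."""
    return [x if x < vocab_size - 1 else 0 for x in input_token_ids]

assert clip_padded_image_ids([1, 5, 2**40], vocab_size=32000) == [1, 5, 0]
```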
```diff
@@ -1344,18 +1385,20 @@ class Scheduler:
 
         if to_del is not None:
             del self.waiting_queue[to_del]
+            logger.debug(f"Abort queued request. {req.rid=}")
+            return
 
         # Delete requests in the running batch
         if self.running_batch:
             for req in self.running_batch.reqs:
                 if req.rid == recv_req.rid and not req.finished():
-                    req.to_abort = True
-
+                    logger.debug(f"Abort running request. {req.rid=}")
+                    req.to_abort = True
                     break
 
-    def update_weights(self, recv_req: UpdateWeightReqInput):
-        """In-place update of the weights."""
-        success, message = self.tp_worker.update_weights(recv_req)
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
         if success:
             flash_cache_success = self.flush_cache()
             assert flash_cache_success, "Cache flush failed after updating weights"
```
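Aborts stay cooperative: a queued request is deleted outright (with the new early `return`), while a running request only gets `to_abort` set and is reaped by the batch loop between steps. A minimal sketch of that two-path shape (container types simplified; not the real Scheduler method):

```python
def abort_request(rid: str, waiting_queue: list, running_reqs: list) -> None:
    # Path 1: still queued -> remove immediately; nothing was computed yet.
    for i, req in enumerate(waiting_queue):
        if req.rid == rid:
            del waiting_queue[i]
            return
    # Path 2: already running -> flag it; the event loop finishes the
    # current step and drops the request at the next batch boundary.
    for req in running_reqs:
        if req.rid == rid and not req.finished():
            req.to_abort = True
            break
```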
```diff
@@ -1363,6 +1406,27 @@ class Scheduler:
             logger.error(message)
         return success, message
 
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return success, message
+
+    def update_weights_from_distributed(
+        self, recv_req: UpdateWeightsFromDistributedReqInput
+    ):
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            flash_cache_success = self.flush_cache()
+            assert flash_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return success, message
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return parameter
+
     def start_profile(self) -> None:
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
```
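These scheduler methods are thin pass-throughs; the communication group and the in-place parameter copy live in the TP worker. A hedged sketch of what a worker-side `update_weights_from_distributed` could look like with plain `torch.distributed` (group bootstrap and parameter lookup simplified; not the actual sglang implementation):

```python
import torch
import torch.distributed as dist

def init_weights_update_group(master_addr: str, master_port: int,
                              rank: int, world_size: int) -> None:
    """Join a process group that also contains the external trainer (rank 0)."""
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        rank=rank,
        world_size=world_size,
    )

def update_weight_from_distributed(model: torch.nn.Module, name: str,
                                   dtype: torch.dtype, shape: tuple) -> None:
    """Receive one tensor broadcast by the trainer and copy it in place."""
    buf = torch.empty(shape, dtype=dtype, device="cuda")
    dist.broadcast(buf, src=0)  # the trainer owns rank 0 in the update group
    with torch.no_grad():
        dict(model.named_parameters())[name].copy_(buf)
```

Flushing the radix cache after a successful update, as the diff does, matters because cached KV entries were computed with the old weights and would silently mix stale state into new generations.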
```diff
@@ -1409,9 +1473,9 @@ def run_scheduler_process(
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
 
-    # [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var
-    if dp_rank is None and "DP_RANK" in os.environ:
-        dp_rank = int(os.environ["DP_RANK"])
+    # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
+    if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
+        dp_rank = int(os.environ["SGLANG_DP_RANK"])
 
     if dp_rank is None:
         configure_logger(server_args, prefix=f" TP{tp_rank}")
```
```diff
@@ -1419,6 +1483,7 @@ def run_scheduler_process(
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
 
     suppress_other_loggers()
+    parent_process = psutil.Process().parent()
 
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
```
```diff
@@ -1430,6 +1495,6 @@ def run_scheduler_process(
         else:
             scheduler.event_loop_normal()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
```
sglang/srt/managers/session_controller.py
CHANGED
```diff
@@ -10,10 +10,7 @@
 # limitations under the License.
 # ==============================================================================
 
-import copy
 import uuid
-from dataclasses import dataclass
-from typing import Optional
 
 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, List, Req
```
|