sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +6 -6
- sglang/bench_one_batch.py +1 -0
- sglang/bench_serving.py +9 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +4 -1
- sglang/srt/layers/attention/flashinfer_backend.py +51 -5
- sglang/srt/layers/attention/triton_backend.py +16 -25
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
- sglang/srt/layers/moe/topk.py +191 -0
- sglang/srt/layers/quantization/__init__.py +5 -50
- sglang/srt/layers/quantization/fp8.py +221 -36
- sglang/srt/layers/quantization/fp8_kernel.py +278 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/radix_attention.py +8 -1
- sglang/srt/layers/sampler.py +27 -5
- sglang/srt/layers/torchao_utils.py +31 -0
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +54 -34
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +171 -136
- sglang/srt/managers/tokenizer_manager.py +184 -133
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +25 -11
- sglang/srt/model_executor/model_runner.py +28 -14
- sglang/srt/model_parallel.py +66 -5
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +73 -9
- sglang/srt/models/llama.py +22 -0
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +8 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/server.py +2 -1
- sglang/srt/server_args.py +19 -9
- sglang/srt/utils.py +40 -54
- sglang/test/test_block_fp8.py +341 -0
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
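Note on the relocated MoE modules listed above: downstream code that imported the fused MoE layers from `sglang.srt.layers.fused_moe_triton` or `sglang.srt.layers.ep_moe` must switch to the new `sglang.srt.layers.moe.*` paths. A minimal compatibility sketch, assuming the `FusedMoE` class is re-exported from both the old and new package `__init__` modules (the exported symbol is an assumption, not something this diff confirms):

    # Hypothetical import shim spanning the 0.4.0.post1 -> 0.4.1 module move.
    try:
        # New layout: sglang/srt/layers/moe/fused_moe_triton (sglang >= 0.4.1)
        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
    except ImportError:
        # Old layout: sglang/srt/layers/fused_moe_triton (sglang <= 0.4.0.post1)
        from sglang.srt.layers.fused_moe_triton import FusedMoE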
sglang/srt/managers/scheduler.py
CHANGED
@@ -22,9 +22,10 @@ import warnings
 from collections import deque
 from concurrent import futures
 from types import SimpleNamespace
-from typing import List, Optional
+from typing import Callable, Dict, List, Optional, Tuple

 import psutil
+import setproctitle
 import torch
 import zmq

@@ -259,7 +260,7 @@ class Scheduler:
         self.current_stream = torch.get_device_module(self.device).current_stream()

         # Session info
-        self.sessions = {}
+        self.sessions: Dict[str, Session] = {}

         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
@@ -514,6 +515,9 @@ class Scheduler:
             recv_req.input_text,
             recv_req.input_ids,
             recv_req.sampling_params,
+            return_logprob=recv_req.return_logprob,
+            top_logprobs_num=recv_req.top_logprobs_num,
+            stream=recv_req.stream,
             lora_path=recv_req.lora_path,
             input_embeds=recv_req.input_embeds,
         )
@@ -557,9 +561,6 @@ class Scheduler:
             return

        # Copy more attributes
-        req.return_logprob = recv_req.return_logprob
-        req.top_logprobs_num = recv_req.top_logprobs_num
-        req.stream = recv_req.stream
         req.logprob_start_len = recv_req.logprob_start_len

         if req.logprob_start_len == -1:
@@ -712,7 +713,7 @@ class Scheduler:
             if crash_on_warnings():
                 raise ValueError(msg)

-    def get_next_batch_to_run(self):
+    def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Merge the prefill batch into the running batch
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             if self.being_chunked_req:
@@ -944,6 +945,7 @@ class Scheduler:
             batch.next_batch_sampling_info.sampling_info_done.set()

     def process_batch_result_prefill(self, batch: ScheduleBatch, result):
+        skip_stream_req = None

         if self.is_generation:
             logits_output, next_token_ids, bid = result
@@ -980,7 +982,6 @@
                     continue

                 if req.is_being_chunked <= 0:
-                    req.completion_tokens_wo_jump_forward += 1
                     req.output_ids.append(next_token_id)
                     req.check_finished()

@@ -1000,6 +1001,10 @@
                 else:
                     # being chunked reqs' prefill is not finished
                     req.is_being_chunked -= 1
+                    # There is only at most one request being currently chunked.
+                    # Because this request does not finish prefill,
+                    # we don't want to stream the request currently being chunked.
+                    skip_stream_req = req

             if batch.next_batch_sampling_info:
                 batch.next_batch_sampling_info.update_regex_vocab_mask()
@@ -1029,7 +1034,7 @@
                     # being chunked reqs' prefill is not finished
                     req.is_being_chunked -= 1

-        self.stream_output(batch.reqs)
+        self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

     def process_batch_result_decode(self, batch: ScheduleBatch, result):
         logits_output, next_token_ids, bid = result
@@ -1059,7 +1064,6 @@
                 self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1])
                 continue

-            req.completion_tokens_wo_jump_forward += 1
             req.output_ids.append(next_token_id)
             req.check_finished()

@@ -1067,11 +1071,15 @@
                 self.tree_cache.cache_finished_req(req)

             if req.return_logprob:
-                req.
-
-                )
+                req.output_token_logprobs_val.append(next_token_logprobs[i])
+                req.output_token_logprobs_idx.append(next_token_id)
                 if req.top_logprobs_num > 0:
-                    req.
+                    req.output_top_logprobs_val.append(
+                        logits_output.output_top_logprobs_val[i]
+                    )
+                    req.output_top_logprobs_idx.append(
+                        logits_output.output_top_logprobs_idx[i]
+                    )

             if req.grammar is not None:
                 req.grammar.accept_token(next_token_id)
@@ -1082,7 +1090,7 @@
             self.current_stream.synchronize()
             batch.next_batch_sampling_info.sampling_info_done.set()

-        self.stream_output(batch.reqs)
+        self.stream_output(batch.reqs, batch.return_logprob)

         self.token_to_kv_pool.free_group_end()

@@ -1102,9 +1110,8 @@
         output: LogitsProcessorOutput,
     ):
         """Attach logprobs to the return values."""
-        req.
-
-        )
+        req.output_token_logprobs_val.append(output.next_token_logprobs[i])
+        req.output_token_logprobs_idx.append(next_token_ids[i])

         # If logprob_start_len > 0, then first logprob_start_len prompt tokens will be ignored.
         num_input_logprobs = req.extend_input_len - req.extend_logprob_start_len
@@ -1112,170 +1119,196 @@
         if req.normalized_prompt_logprob is None:
             req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i]

-        if req.
-
+        if req.input_token_logprobs_val is None:
+            input_token_logprobs_val = output.input_token_logprobs[
                 pt : pt + num_input_logprobs - 1 - req.last_update_decode_tokens
             ]
-
+
+            input_token_logprobs_idx = req.fill_ids[
                 len(req.fill_ids)
                 - num_input_logprobs
                 + 1 : len(req.fill_ids)
                 - req.last_update_decode_tokens
             ]
-
             # Clip the padded hash values from image tokens.
             # Otherwise, it will lead to detokenization errors.
-
+            input_token_logprobs_idx = [
                 x if x < self.model_config.vocab_size - 1 else 0
-                for x in
+                for x in input_token_logprobs_idx
             ]

-            req.input_token_logprobs = list(zip(input_token_logprobs, input_token_ids))
-
             if (
                 req.logprob_start_len == 0
             ): # The first token does not have logprob, pad it.
-
-
-
+                input_token_logprobs_val = [None] + input_token_logprobs_val
+                input_token_logprobs_idx = [req.fill_ids[0]] + input_token_logprobs_idx
+
+            req.input_token_logprobs_val = input_token_logprobs_val
+            req.input_token_logprobs_idx = input_token_logprobs_idx

         if req.last_update_decode_tokens != 0:
             # Some decode tokens are re-computed in an extend batch
-            req.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            )
+            req.output_token_logprobs_val.extend(
+                output.input_token_logprobs[
+                    pt
+                    + num_input_logprobs
+                    - 1
+                    - req.last_update_decode_tokens : pt
+                    + num_input_logprobs
+                    - 1
+                ],
+            )
+            req.output_token_logprobs_idx.extend(
+                req.fill_ids[
+                    len(req.fill_ids)
+                    - req.last_update_decode_tokens : len(req.fill_ids)
+                ]
             )

         if req.top_logprobs_num > 0:
-            if req.
-                req.
+            if req.input_top_logprobs_val is None:
+                req.input_top_logprobs_val = output.input_top_logprobs_val[i]
+                req.input_top_logprobs_idx = output.input_top_logprobs_idx[i]
                 if req.logprob_start_len == 0:
-                    req.
+                    req.input_top_logprobs_val = [None] + req.input_top_logprobs_val
+                    req.input_top_logprobs_idx = [None] + req.input_top_logprobs_idx

             if req.last_update_decode_tokens != 0:
-                req.
-                    output.
+                req.output_top_logprobs_val.extend(
+                    output.input_top_logprobs_val[i][-req.last_update_decode_tokens :]
                 )
-
+                req.output_top_logprobs_idx.extend(
+                    output.input_top_logprobs_idx[i][-req.last_update_decode_tokens :]
+                )
+            req.output_top_logprobs_val.append(output.output_top_logprobs_val[i])
+            req.output_top_logprobs_idx.append(output.output_top_logprobs_idx[i])

         return num_input_logprobs

-    def stream_output(
+    def stream_output(
+        self, reqs: List[Req], return_logprob: bool, skip_req: Optional[Req] = None
+    ):
         """Stream the output to detokenizer."""
-
-
-
+        rids = []
+        finished_reasons: List[BaseFinishReason] = []
+
         if self.is_generation:
-
+            vids = []
             decoded_texts = []
-
-
+            decode_ids_list = []
+            read_offsets = []
             output_ids = []
-            output_skip_special_tokens = []
-            output_spaces_between_special_tokens = []
-            output_no_stop_trim = []
-        else: # embedding or reward model
-            output_embeddings = []

-
+            skip_special_tokens = []
+            spaces_between_special_tokens = []
+            no_stop_trim = []
+            prompt_tokens = []
+            completion_tokens = []
+            cached_tokens = []
+
+            if return_logprob:
+                input_token_logprobs_val = []
+                input_token_logprobs_idx = []
+                output_token_logprobs_val = []
+                output_token_logprobs_idx = []
+                input_top_logprobs_val = []
+                input_top_logprobs_idx = []
+                output_top_logprobs_val = []
+                output_top_logprobs_idx = []
+                normalized_prompt_logprob = []
+            else:
+                input_token_logprobs_val = input_token_logprobs_idx = (
+                    output_token_logprobs_val
+                ) = output_token_logprobs_idx = input_top_logprobs_val = (
+                    input_top_logprobs_idx
+                ) = output_top_logprobs_val = output_top_logprobs_idx = (
+                    normalized_prompt_logprob
+                ) = None
+
+            for req in reqs:
+                if req is skip_req:
+                    continue

-
-
-
-
-
-
-
-
-
+                # TODO(lianmin): revisit this for overlap + retract + stream
+                if (
+                    req.finished()
+                    # If stream, follow the given stream_interval
+                    or (req.stream and len(req.output_ids) % self.stream_interval == 0)
+                    # If not stream, we still want to output some tokens to get the benefit of incremental decoding.
+                    or (not req.stream and len(req.output_ids) % 50 == 0)
+                ):
+                    rids.append(req.rid)
+                    finished_reasons.append(
+                        req.finished_reason.to_json() if req.finished_reason else None
+                    )
+                    vids.append(req.vid)
                     decoded_texts.append(req.decoded_text)
-
-
-
+                    decode_ids, read_offset = req.init_incremental_detokenize()
+                    decode_ids_list.append(decode_ids)
+                    read_offsets.append(read_offset)
                     if self.skip_tokenizer_init:
                         output_ids.append(req.output_ids)
-
-
-                    )
-                    output_spaces_between_special_tokens.append(
+                    skip_special_tokens.append(req.sampling_params.skip_special_tokens)
+                    spaces_between_special_tokens.append(
                         req.sampling_params.spaces_between_special_tokens
                     )
-
-
-
-
-
-
-
-
-
-
-
-                    )
-
-
-                    (
-
-
-
-
-                        meta_info["normalized_prompt_logprob"],
-                    ) = (
-                        req.input_token_logprobs,
-                        req.output_token_logprobs,
-                        req.input_top_logprobs,
-                        req.output_top_logprobs,
-                        req.normalized_prompt_logprob,
-                    )
-                    output_meta_info.append(meta_info)
-            else: # embedding or reward model
-                output_embeddings.append(req.embedding)
-                meta_info = {
-                    "prompt_tokens": len(req.origin_input_ids),
-                }
-                output_meta_info.append(meta_info)
-
-        # Send to detokenizer
-        if output_rids:
-            if self.is_generation:
+                    no_stop_trim.append(req.sampling_params.no_stop_trim)
+
+                    prompt_tokens.append(len(req.origin_input_ids))
+                    completion_tokens.append(len(req.output_ids))
+                    cached_tokens.append(req.cached_tokens)
+
+                    if return_logprob:
+                        input_token_logprobs_val.append(req.input_token_logprobs_val)
+                        input_token_logprobs_idx.append(req.input_token_logprobs_idx)
+                        output_token_logprobs_val.append(req.output_token_logprobs_val)
+                        output_token_logprobs_idx.append(req.output_token_logprobs_idx)
+                        input_top_logprobs_val.append(req.input_top_logprobs_val)
+                        input_top_logprobs_idx.append(req.input_top_logprobs_idx)
+                        output_top_logprobs_val.append(req.output_top_logprobs_val)
+                        output_top_logprobs_idx.append(req.output_top_logprobs_idx)
+                        normalized_prompt_logprob.append(req.normalized_prompt_logprob)
+
+            # Send to detokenizer
+            if rids:
                 self.send_to_detokenizer.send_pyobj(
                     BatchTokenIDOut(
-
-
+                        rids,
+                        finished_reasons,
+                        vids,
                         decoded_texts,
-
-
+                        decode_ids_list,
+                        read_offsets,
                         output_ids,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        skip_special_tokens,
+                        spaces_between_special_tokens,
+                        no_stop_trim,
+                        prompt_tokens,
+                        completion_tokens,
+                        cached_tokens,
+                        input_token_logprobs_val,
+                        input_token_logprobs_idx,
+                        output_token_logprobs_val,
+                        output_token_logprobs_idx,
+                        input_top_logprobs_val,
+                        input_top_logprobs_idx,
+                        output_top_logprobs_val,
+                        output_top_logprobs_idx,
+                        normalized_prompt_logprob,
                     )
                 )
+        else: # embedding or reward model
+            embeddings = []
+            prompt_tokens = []
+            for req in reqs:
+                assert req.finished()
+                rids.append(req.rid)
+                finished_reasons.append(req.finished_reason.to_json())
+                embeddings.append(req.embedding)
+                prompt_tokens.append(len(req.origin_input_ids))
+            self.send_to_detokenizer.send_pyobj(
+                BatchEmbeddingOut(rids, finished_reasons, embeddings, prompt_tokens)
+            )

     def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
         # Check if other DP workers have running batches
@@ -1473,6 +1506,8 @@ def run_scheduler_process(
    dp_rank: Optional[int],
    pipe_writer,
 ):
+    setproctitle.setproctitle("sglang::scheduler")
+
     # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
     if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
         dp_rank = int(os.environ["SGLANG_DP_RANK"])