sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff covers the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (74)
  1. sglang/bench_offline_throughput.py +6 -6
  2. sglang/bench_one_batch.py +1 -0
  3. sglang/bench_serving.py +9 -1
  4. sglang/check_env.py +140 -48
  5. sglang/lang/backend/runtime_endpoint.py +1 -0
  6. sglang/lang/chat_template.py +32 -0
  7. sglang/llama3_eval.py +316 -0
  8. sglang/srt/aio_rwlock.py +100 -0
  9. sglang/srt/configs/model_config.py +8 -1
  10. sglang/srt/constrained/xgrammar_backend.py +4 -1
  11. sglang/srt/layers/attention/flashinfer_backend.py +51 -5
  12. sglang/srt/layers/attention/triton_backend.py +16 -25
  13. sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
  14. sglang/srt/layers/linear.py +20 -2
  15. sglang/srt/layers/logits_processor.py +133 -95
  16. sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
  17. sglang/srt/layers/moe/fused_moe_native.py +46 -0
  18. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
  19. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
  20. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
  21. sglang/srt/layers/moe/topk.py +191 -0
  22. sglang/srt/layers/quantization/__init__.py +5 -50
  23. sglang/srt/layers/quantization/fp8.py +221 -36
  24. sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  25. sglang/srt/layers/quantization/fp8_utils.py +90 -1
  26. sglang/srt/layers/radix_attention.py +8 -1
  27. sglang/srt/layers/sampler.py +27 -5
  28. sglang/srt/layers/torchao_utils.py +31 -0
  29. sglang/srt/managers/detokenizer_manager.py +37 -17
  30. sglang/srt/managers/io_struct.py +39 -10
  31. sglang/srt/managers/schedule_batch.py +54 -34
  32. sglang/srt/managers/schedule_policy.py +64 -5
  33. sglang/srt/managers/scheduler.py +171 -136
  34. sglang/srt/managers/tokenizer_manager.py +184 -133
  35. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  36. sglang/srt/mem_cache/chunk_cache.py +2 -2
  37. sglang/srt/mem_cache/memory_pool.py +15 -8
  38. sglang/srt/mem_cache/radix_cache.py +12 -2
  39. sglang/srt/model_executor/cuda_graph_runner.py +25 -11
  40. sglang/srt/model_executor/model_runner.py +28 -14
  41. sglang/srt/model_parallel.py +66 -5
  42. sglang/srt/models/dbrx.py +1 -1
  43. sglang/srt/models/deepseek.py +1 -1
  44. sglang/srt/models/deepseek_v2.py +67 -18
  45. sglang/srt/models/gemma2.py +34 -0
  46. sglang/srt/models/gemma2_reward.py +0 -1
  47. sglang/srt/models/granite.py +517 -0
  48. sglang/srt/models/grok.py +73 -9
  49. sglang/srt/models/llama.py +22 -0
  50. sglang/srt/models/llama_classification.py +11 -23
  51. sglang/srt/models/llama_reward.py +0 -2
  52. sglang/srt/models/llava.py +37 -14
  53. sglang/srt/models/mixtral.py +2 -2
  54. sglang/srt/models/olmoe.py +1 -1
  55. sglang/srt/models/qwen2.py +20 -0
  56. sglang/srt/models/qwen2_moe.py +1 -1
  57. sglang/srt/models/xverse_moe.py +1 -1
  58. sglang/srt/openai_api/adapter.py +8 -0
  59. sglang/srt/openai_api/protocol.py +9 -4
  60. sglang/srt/server.py +2 -1
  61. sglang/srt/server_args.py +19 -9
  62. sglang/srt/utils.py +40 -54
  63. sglang/test/test_block_fp8.py +341 -0
  64. sglang/test/test_utils.py +3 -2
  65. sglang/utils.py +10 -3
  66. sglang/version.py +1 -1
  67. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
  68. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
  69. sglang/srt/layers/fused_moe_patch.py +0 -133
  70. /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
  71. /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
  72. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
  73. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
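
Side note (not part of the diff page): the file-level comparison above can be reproduced locally by diffing the name lists of the two wheels. The sketch below is illustrative only; the wheel filenames are assumed to sit in the current directory, e.g. after `pip download sglang==0.4.0.post1 --no-deps` and `pip download sglang==0.4.1 --no-deps`.

# Illustrative sketch: list files added/removed between two locally
# downloaded wheels. The paths below are assumptions, not part of the diff.
import zipfile

old_whl = "sglang-0.4.0.post1-py3-none-any.whl"
new_whl = "sglang-0.4.1-py3-none-any.whl"

old_files = set(zipfile.ZipFile(old_whl).namelist())
new_files = set(zipfile.ZipFile(new_whl).namelist())

print("added:  ", sorted(new_files - old_files))
print("removed:", sorted(old_files - new_files))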
@@ -22,9 +22,10 @@ import warnings
 from collections import deque
 from concurrent import futures
 from types import SimpleNamespace
-from typing import List, Optional
+from typing import Callable, Dict, List, Optional, Tuple

 import psutil
+import setproctitle
 import torch
 import zmq

@@ -259,7 +260,7 @@ class Scheduler:
         self.current_stream = torch.get_device_module(self.device).current_stream()

         # Session info
-        self.sessions = {}
+        self.sessions: Dict[str, Session] = {}

         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
@@ -514,6 +515,9 @@ class Scheduler:
             recv_req.input_text,
             recv_req.input_ids,
             recv_req.sampling_params,
+            return_logprob=recv_req.return_logprob,
+            top_logprobs_num=recv_req.top_logprobs_num,
+            stream=recv_req.stream,
             lora_path=recv_req.lora_path,
             input_embeds=recv_req.input_embeds,
         )
@@ -557,9 +561,6 @@ class Scheduler:
             return

         # Copy more attributes
-        req.return_logprob = recv_req.return_logprob
-        req.top_logprobs_num = recv_req.top_logprobs_num
-        req.stream = recv_req.stream
        req.logprob_start_len = recv_req.logprob_start_len

        if req.logprob_start_len == -1:
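
The two hunks above move return_logprob, top_logprobs_num, and stream from post-construction attribute assignment into keyword arguments of the Req constructor call. A minimal sketch of the same pattern, using a simplified stand-in class rather than sglang's actual Req:

# Simplified stand-in for the pattern above; this is not sglang's Req class.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class SimpleRequest:
    input_text: str
    input_ids: List[int]
    return_logprob: bool = False
    top_logprobs_num: int = 0
    stream: bool = False
    lora_path: Optional[str] = None


# 0.4.1 style: streaming/logprob flags are supplied at construction time
# instead of being patched onto the object afterwards.
req = SimpleRequest("hi", [1, 2, 3], return_logprob=True, top_logprobs_num=5, stream=True)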
@@ -712,7 +713,7 @@ class Scheduler:
         if crash_on_warnings():
             raise ValueError(msg)

-    def get_next_batch_to_run(self):
+    def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Merge the prefill batch into the running batch
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             if self.being_chunked_req:
@@ -944,6 +945,7 @@ class Scheduler:
             batch.next_batch_sampling_info.sampling_info_done.set()

     def process_batch_result_prefill(self, batch: ScheduleBatch, result):
+        skip_stream_req = None

         if self.is_generation:
             logits_output, next_token_ids, bid = result
@@ -980,7 +982,6 @@ class Scheduler:
                     continue

                 if req.is_being_chunked <= 0:
-                    req.completion_tokens_wo_jump_forward += 1
                     req.output_ids.append(next_token_id)
                     req.check_finished()

@@ -1000,6 +1001,10 @@ class Scheduler:
                 else:
                     # being chunked reqs' prefill is not finished
                     req.is_being_chunked -= 1
+                    # There is only at most one request being currently chunked.
+                    # Because this request does not finish prefill,
+                    # we don't want to stream the request currently being chunked.
+                    skip_stream_req = req

         if batch.next_batch_sampling_info:
             batch.next_batch_sampling_info.update_regex_vocab_mask()
@@ -1029,7 +1034,7 @@ class Scheduler:
                     # being chunked reqs' prefill is not finished
                     req.is_being_chunked -= 1

-        self.stream_output(batch.reqs)
+        self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

     def process_batch_result_decode(self, batch: ScheduleBatch, result):
         logits_output, next_token_ids, bid = result
@@ -1059,7 +1064,6 @@ class Scheduler:
                 self.token_to_kv_pool.free(batch.out_cache_loc[i : i + 1])
                 continue

-            req.completion_tokens_wo_jump_forward += 1
            req.output_ids.append(next_token_id)
            req.check_finished()

@@ -1067,11 +1071,15 @@ class Scheduler:
                 self.tree_cache.cache_finished_req(req)

             if req.return_logprob:
-                req.output_token_logprobs.append(
-                    (next_token_logprobs[i], next_token_id)
-                )
+                req.output_token_logprobs_val.append(next_token_logprobs[i])
+                req.output_token_logprobs_idx.append(next_token_id)
                 if req.top_logprobs_num > 0:
-                    req.output_top_logprobs.append(logits_output.output_top_logprobs[i])
+                    req.output_top_logprobs_val.append(
+                        logits_output.output_top_logprobs_val[i]
+                    )
+                    req.output_top_logprobs_idx.append(
+                        logits_output.output_top_logprobs_idx[i]
+                    )

             if req.grammar is not None:
                 req.grammar.accept_token(next_token_id)
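
In 0.4.0.post1 the per-request logprobs were stored as lists of (value, token_id) tuples; 0.4.1 keeps two parallel lists with _val and _idx suffixes, as the hunk above shows for the decode path. A small sketch of the layout change, with made-up numbers:

# Made-up values, only to illustrate the tuple-list -> parallel-lists change.
old_layout = [(-0.11, 42), (-1.32, 7), (-0.05, 99)]      # 0.4.0.post1 style

output_token_logprobs_val = [v for v, _ in old_layout]   # 0.4.1 style
output_token_logprobs_idx = [t for _, t in old_layout]

# Zipping the two lists recovers the old tuple view when needed.
assert list(zip(output_token_logprobs_val, output_token_logprobs_idx)) == old_layout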
@@ -1082,7 +1090,7 @@ class Scheduler:
             self.current_stream.synchronize()
             batch.next_batch_sampling_info.sampling_info_done.set()

-        self.stream_output(batch.reqs)
+        self.stream_output(batch.reqs, batch.return_logprob)

         self.token_to_kv_pool.free_group_end()

@@ -1102,9 +1110,8 @@ class Scheduler:
         output: LogitsProcessorOutput,
     ):
         """Attach logprobs to the return values."""
-        req.output_token_logprobs.append(
-            (output.next_token_logprobs[i], next_token_ids[i])
-        )
+        req.output_token_logprobs_val.append(output.next_token_logprobs[i])
+        req.output_token_logprobs_idx.append(next_token_ids[i])

         # If logprob_start_len > 0, then first logprob_start_len prompt tokens will be ignored.
         num_input_logprobs = req.extend_input_len - req.extend_logprob_start_len
@@ -1112,170 +1119,196 @@ class Scheduler:
         if req.normalized_prompt_logprob is None:
             req.normalized_prompt_logprob = output.normalized_prompt_logprobs[i]

-        if req.input_token_logprobs is None:
-            input_token_logprobs = output.input_token_logprobs[
+        if req.input_token_logprobs_val is None:
+            input_token_logprobs_val = output.input_token_logprobs[
                 pt : pt + num_input_logprobs - 1 - req.last_update_decode_tokens
             ]
-            input_token_ids = req.fill_ids[
+
+            input_token_logprobs_idx = req.fill_ids[
                 len(req.fill_ids)
                 - num_input_logprobs
                 + 1 : len(req.fill_ids)
                 - req.last_update_decode_tokens
             ]
-
             # Clip the padded hash values from image tokens.
             # Otherwise, it will lead to detokenization errors.
-            input_token_ids = [
+            input_token_logprobs_idx = [
                 x if x < self.model_config.vocab_size - 1 else 0
-                for x in input_token_ids
+                for x in input_token_logprobs_idx
             ]

-            req.input_token_logprobs = list(zip(input_token_logprobs, input_token_ids))
-
             if (
                 req.logprob_start_len == 0
             ): # The first token does not have logprob, pad it.
-                req.input_token_logprobs = [
-                    (None, req.fill_ids[0])
-                ] + req.input_token_logprobs
+                input_token_logprobs_val = [None] + input_token_logprobs_val
+                input_token_logprobs_idx = [req.fill_ids[0]] + input_token_logprobs_idx
+
+            req.input_token_logprobs_val = input_token_logprobs_val
+            req.input_token_logprobs_idx = input_token_logprobs_idx

         if req.last_update_decode_tokens != 0:
             # Some decode tokens are re-computed in an extend batch
-            req.output_token_logprobs.extend(
-                list(
-                    zip(
-                        output.input_token_logprobs[
-                            pt
-                            + num_input_logprobs
-                            - 1
-                            - req.last_update_decode_tokens : pt
-                            + num_input_logprobs
-                            - 1
-                        ],
-                        req.fill_ids[
-                            len(req.fill_ids)
-                            - req.last_update_decode_tokens : len(req.fill_ids)
-                        ],
-                    )
-                )
+            req.output_token_logprobs_val.extend(
+                output.input_token_logprobs[
+                    pt
+                    + num_input_logprobs
+                    - 1
+                    - req.last_update_decode_tokens : pt
+                    + num_input_logprobs
+                    - 1
+                ],
+            )
+            req.output_token_logprobs_idx.extend(
+                req.fill_ids[
+                    len(req.fill_ids)
+                    - req.last_update_decode_tokens : len(req.fill_ids)
+                ]
             )

         if req.top_logprobs_num > 0:
-            if req.input_top_logprobs is None:
-                req.input_top_logprobs = output.input_top_logprobs[i]
+            if req.input_top_logprobs_val is None:
+                req.input_top_logprobs_val = output.input_top_logprobs_val[i]
+                req.input_top_logprobs_idx = output.input_top_logprobs_idx[i]
                 if req.logprob_start_len == 0:
-                    req.input_top_logprobs = [None] + req.input_top_logprobs
+                    req.input_top_logprobs_val = [None] + req.input_top_logprobs_val
+                    req.input_top_logprobs_idx = [None] + req.input_top_logprobs_idx

             if req.last_update_decode_tokens != 0:
-                req.output_top_logprobs.extend(
-                    output.input_top_logprobs[i][-req.last_update_decode_tokens :]
+                req.output_top_logprobs_val.extend(
+                    output.input_top_logprobs_val[i][-req.last_update_decode_tokens :]
                 )
-            req.output_top_logprobs.append(output.output_top_logprobs[i])
+                req.output_top_logprobs_idx.extend(
+                    output.input_top_logprobs_idx[i][-req.last_update_decode_tokens :]
+                )
+            req.output_top_logprobs_val.append(output.output_top_logprobs_val[i])
+            req.output_top_logprobs_idx.append(output.output_top_logprobs_idx[i])

         return num_input_logprobs

-    def stream_output(self, reqs: List[Req]):
+    def stream_output(
+        self, reqs: List[Req], return_logprob: bool, skip_req: Optional[Req] = None
+    ):
         """Stream the output to detokenizer."""
-        output_rids = []
-        output_meta_info: List[dict] = []
-        output_finished_reason: List[BaseFinishReason] = []
+        rids = []
+        finished_reasons: List[BaseFinishReason] = []
+
         if self.is_generation:
-            output_vids = []
+            vids = []
             decoded_texts = []
-            output_read_ids = []
-            output_read_offsets = []
+            decode_ids_list = []
+            read_offsets = []
             output_ids = []
-            output_skip_special_tokens = []
-            output_spaces_between_special_tokens = []
-            output_no_stop_trim = []
-        else: # embedding or reward model
-            output_embeddings = []

-        is_stream_iter = self.forward_ct_decode % self.stream_interval == 0
+            skip_special_tokens = []
+            spaces_between_special_tokens = []
+            no_stop_trim = []
+            prompt_tokens = []
+            completion_tokens = []
+            cached_tokens = []
+
+            if return_logprob:
+                input_token_logprobs_val = []
+                input_token_logprobs_idx = []
+                output_token_logprobs_val = []
+                output_token_logprobs_idx = []
+                input_top_logprobs_val = []
+                input_top_logprobs_idx = []
+                output_top_logprobs_val = []
+                output_top_logprobs_idx = []
+                normalized_prompt_logprob = []
+            else:
+                input_token_logprobs_val = input_token_logprobs_idx = (
+                    output_token_logprobs_val
+                ) = output_token_logprobs_idx = input_top_logprobs_val = (
+                    input_top_logprobs_idx
+                ) = output_top_logprobs_val = output_top_logprobs_idx = (
+                    normalized_prompt_logprob
+                ) = None
+
+            for req in reqs:
+                if req is skip_req:
+                    continue

-        for req in reqs:
-            # TODO(lianmin): revisit this for overlap + retract + stream
-            if req.finished() or (
-                req.stream and (is_stream_iter or len(req.output_ids) == 1)
-            ):
-                output_rids.append(req.rid)
-                output_finished_reason.append(req.finished_reason)
-                if self.is_generation:
-                    output_vids.append(req.vid)
+                # TODO(lianmin): revisit this for overlap + retract + stream
+                if (
+                    req.finished()
+                    # If stream, follow the given stream_interval
+                    or (req.stream and len(req.output_ids) % self.stream_interval == 0)
+                    # If not stream, we still want to output some tokens to get the benefit of incremental decoding.
+                    or (not req.stream and len(req.output_ids) % 50 == 0)
+                ):
+                    rids.append(req.rid)
+                    finished_reasons.append(
+                        req.finished_reason.to_json() if req.finished_reason else None
+                    )
+                    vids.append(req.vid)
                     decoded_texts.append(req.decoded_text)
-                    read_ids, read_offset = req.init_incremental_detokenize()
-                    output_read_ids.append(read_ids)
-                    output_read_offsets.append(read_offset)
+                    decode_ids, read_offset = req.init_incremental_detokenize()
+                    decode_ids_list.append(decode_ids)
+                    read_offsets.append(read_offset)
                     if self.skip_tokenizer_init:
                         output_ids.append(req.output_ids)
-                    output_skip_special_tokens.append(
-                        req.sampling_params.skip_special_tokens
-                    )
-                    output_spaces_between_special_tokens.append(
+                    skip_special_tokens.append(req.sampling_params.skip_special_tokens)
+                    spaces_between_special_tokens.append(
                         req.sampling_params.spaces_between_special_tokens
                     )
-                    output_no_stop_trim.append(req.sampling_params.no_stop_trim)
-
-                    meta_info = {
-                        "prompt_tokens": len(req.origin_input_ids),
-                        "completion_tokens": len(req.output_ids),
-                        "completion_tokens_wo_jump_forward": req.completion_tokens_wo_jump_forward,
-                        "cached_tokens": req.cached_tokens,
-                        "finish_reason": (
-                            req.finished_reason.to_json()
-                            if req.finished_reason is not None
-                            else None
-                        ),
-                    }
-                    if req.return_logprob:
-                        (
-                            meta_info["input_token_logprobs"],
-                            meta_info["output_token_logprobs"],
-                            meta_info["input_top_logprobs"],
-                            meta_info["output_top_logprobs"],
-                            meta_info["normalized_prompt_logprob"],
-                        ) = (
-                            req.input_token_logprobs,
-                            req.output_token_logprobs,
-                            req.input_top_logprobs,
-                            req.output_top_logprobs,
-                            req.normalized_prompt_logprob,
-                        )
-                    output_meta_info.append(meta_info)
-                else: # embedding or reward model
-                    output_embeddings.append(req.embedding)
-                    meta_info = {
-                        "prompt_tokens": len(req.origin_input_ids),
-                    }
-                    output_meta_info.append(meta_info)
-
-        # Send to detokenizer
-        if output_rids:
-            if self.is_generation:
+                    no_stop_trim.append(req.sampling_params.no_stop_trim)
+
+                    prompt_tokens.append(len(req.origin_input_ids))
+                    completion_tokens.append(len(req.output_ids))
+                    cached_tokens.append(req.cached_tokens)
+
+                    if return_logprob:
+                        input_token_logprobs_val.append(req.input_token_logprobs_val)
+                        input_token_logprobs_idx.append(req.input_token_logprobs_idx)
+                        output_token_logprobs_val.append(req.output_token_logprobs_val)
+                        output_token_logprobs_idx.append(req.output_token_logprobs_idx)
+                        input_top_logprobs_val.append(req.input_top_logprobs_val)
+                        input_top_logprobs_idx.append(req.input_top_logprobs_idx)
+                        output_top_logprobs_val.append(req.output_top_logprobs_val)
+                        output_top_logprobs_idx.append(req.output_top_logprobs_idx)
+                        normalized_prompt_logprob.append(req.normalized_prompt_logprob)
+
+            # Send to detokenizer
+            if rids:
                 self.send_to_detokenizer.send_pyobj(
                     BatchTokenIDOut(
-                        output_rids,
-                        output_vids,
+                        rids,
+                        finished_reasons,
+                        vids,
                         decoded_texts,
-                        output_read_ids,
-                        output_read_offsets,
+                        decode_ids_list,
+                        read_offsets,
                         output_ids,
-                        output_skip_special_tokens,
-                        output_spaces_between_special_tokens,
-                        output_meta_info,
-                        output_finished_reason,
-                        output_no_stop_trim,
-                    )
-                )
-            else: # embedding or reward model
-                self.send_to_detokenizer.send_pyobj(
-                    BatchEmbeddingOut(
-                        output_rids,
-                        output_embeddings,
-                        output_meta_info,
-                        output_finished_reason,
+                        skip_special_tokens,
+                        spaces_between_special_tokens,
+                        no_stop_trim,
+                        prompt_tokens,
+                        completion_tokens,
+                        cached_tokens,
+                        input_token_logprobs_val,
+                        input_token_logprobs_idx,
+                        output_token_logprobs_val,
+                        output_token_logprobs_idx,
+                        input_top_logprobs_val,
+                        input_top_logprobs_idx,
+                        output_top_logprobs_val,
+                        output_top_logprobs_idx,
+                        normalized_prompt_logprob,
                     )
                 )
+        else: # embedding or reward model
+            embeddings = []
+            prompt_tokens = []
+            for req in reqs:
+                assert req.finished()
+                rids.append(req.rid)
+                finished_reasons.append(req.finished_reason.to_json())
+                embeddings.append(req.embedding)
+                prompt_tokens.append(len(req.origin_input_ids))
+            self.send_to_detokenizer.send_pyobj(
+                BatchEmbeddingOut(rids, finished_reasons, embeddings, prompt_tokens)
+            )

     def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
         # Check if other DP workers have running batches
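
Two behavioral points in the rewritten stream_output above are easy to miss: the per-request meta_info dict is replaced by flat parallel lists passed positionally to BatchTokenIDOut, and the streaming trigger now depends on the per-request output length rather than the global forward_ct_decode counter, with non-streaming requests also flushed every 50 tokens. The condition can be restated as a standalone helper (hypothetical names, not part of sglang's API):

# Hypothetical restatement of the streaming condition in the hunk above.
def should_emit(finished: bool, stream: bool, num_output_ids: int,
                stream_interval: int) -> bool:
    return (
        finished
        # If stream, follow the given stream_interval.
        or (stream and num_output_ids % stream_interval == 0)
        # If not stream, still flush periodically so incremental
        # detokenization can make progress.
        or (not stream and num_output_ids % 50 == 0)
    )


assert should_emit(finished=True, stream=False, num_output_ids=3, stream_interval=8)
assert should_emit(finished=False, stream=True, num_output_ids=16, stream_interval=8)
assert not should_emit(finished=False, stream=True, num_output_ids=17, stream_interval=8)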
@@ -1473,6 +1506,8 @@ def run_scheduler_process(
     dp_rank: Optional[int],
     pipe_writer,
 ):
+    setproctitle.setproctitle("sglang::scheduler")
+
     # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
     if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
         dp_rank = int(os.environ["SGLANG_DP_RANK"])
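
The new setproctitle.setproctitle call above only renames the scheduler process so it is identifiable in ps/top output. A minimal usage sketch of the third-party setproctitle package:

# Minimal sketch of the setproctitle call added in run_scheduler_process.
# Requires the third-party package:  pip install setproctitle
import setproctitle

setproctitle.setproctitle("sglang::scheduler")
print(setproctitle.getproctitle())  # the name now shown by ps/top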