sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +3 -13
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +158 -8
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +119 -75
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +5 -2
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/internvl.py +696 -0
  13. sglang/srt/configs/janus_pro.py +3 -0
  14. sglang/srt/configs/model_config.py +18 -0
  15. sglang/srt/constrained/base_grammar_backend.py +55 -72
  16. sglang/srt/constrained/llguidance_backend.py +25 -21
  17. sglang/srt/constrained/outlines_backend.py +27 -26
  18. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  19. sglang/srt/constrained/xgrammar_backend.py +71 -53
  20. sglang/srt/conversation.py +78 -46
  21. sglang/srt/disaggregation/base/conn.py +1 -0
  22. sglang/srt/disaggregation/decode.py +11 -3
  23. sglang/srt/disaggregation/fake/conn.py +1 -1
  24. sglang/srt/disaggregation/mini_lb.py +74 -23
  25. sglang/srt/disaggregation/mooncake/conn.py +236 -138
  26. sglang/srt/disaggregation/nixl/conn.py +242 -71
  27. sglang/srt/disaggregation/prefill.py +7 -4
  28. sglang/srt/disaggregation/utils.py +51 -2
  29. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  30. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  31. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  32. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  33. sglang/srt/distributed/parallel_state.py +22 -1
  34. sglang/srt/entrypoints/engine.py +31 -4
  35. sglang/srt/entrypoints/http_server.py +45 -3
  36. sglang/srt/entrypoints/verl_engine.py +3 -2
  37. sglang/srt/function_call_parser.py +2 -2
  38. sglang/srt/hf_transformers_utils.py +20 -1
  39. sglang/srt/layers/attention/flashattention_backend.py +147 -51
  40. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  41. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  42. sglang/srt/layers/attention/merge_state.py +46 -0
  43. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  44. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  45. sglang/srt/layers/attention/utils.py +4 -2
  46. sglang/srt/layers/attention/vision.py +290 -163
  47. sglang/srt/layers/dp_attention.py +71 -21
  48. sglang/srt/layers/layernorm.py +1 -1
  49. sglang/srt/layers/logits_processor.py +46 -11
  50. sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
  51. sglang/srt/layers/moe/ep_moe/layer.py +121 -2
  52. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  56. sglang/srt/layers/moe/topk.py +1 -1
  57. sglang/srt/layers/quantization/__init__.py +1 -1
  58. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  59. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  60. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  61. sglang/srt/layers/quantization/deep_gemm.py +77 -71
  62. sglang/srt/layers/quantization/fp8.py +110 -97
  63. sglang/srt/layers/quantization/fp8_kernel.py +81 -62
  64. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  65. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  66. sglang/srt/layers/quantization/kv_cache.py +3 -10
  67. sglang/srt/layers/quantization/utils.py +0 -5
  68. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  69. sglang/srt/layers/sampler.py +0 -4
  70. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  71. sglang/srt/lora/lora_manager.py +11 -14
  72. sglang/srt/lora/mem_pool.py +4 -4
  73. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  74. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  75. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  76. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  77. sglang/srt/lora/utils.py +1 -1
  78. sglang/srt/managers/cache_controller.py +115 -119
  79. sglang/srt/managers/data_parallel_controller.py +3 -3
  80. sglang/srt/managers/detokenizer_manager.py +21 -8
  81. sglang/srt/managers/io_struct.py +13 -1
  82. sglang/srt/managers/mm_utils.py +1 -1
  83. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  84. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  85. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  86. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  87. sglang/srt/managers/schedule_batch.py +93 -23
  88. sglang/srt/managers/schedule_policy.py +11 -8
  89. sglang/srt/managers/scheduler.py +140 -100
  90. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  91. sglang/srt/managers/tokenizer_manager.py +157 -47
  92. sglang/srt/managers/tp_worker.py +21 -21
  93. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  94. sglang/srt/mem_cache/chunk_cache.py +2 -0
  95. sglang/srt/mem_cache/memory_pool.py +4 -2
  96. sglang/srt/metrics/collector.py +312 -37
  97. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  98. sglang/srt/model_executor/forward_batch_info.py +1 -1
  99. sglang/srt/model_executor/model_runner.py +57 -41
  100. sglang/srt/model_loader/loader.py +18 -11
  101. sglang/srt/models/clip.py +4 -4
  102. sglang/srt/models/deepseek_janus_pro.py +3 -3
  103. sglang/srt/models/deepseek_nextn.py +1 -20
  104. sglang/srt/models/deepseek_v2.py +77 -39
  105. sglang/srt/models/gemma3_mm.py +1 -1
  106. sglang/srt/models/internlm2.py +3 -0
  107. sglang/srt/models/internvl.py +670 -0
  108. sglang/srt/models/llama.py +3 -1
  109. sglang/srt/models/llama4.py +58 -13
  110. sglang/srt/models/llava.py +248 -5
  111. sglang/srt/models/minicpmv.py +1 -1
  112. sglang/srt/models/mixtral.py +98 -34
  113. sglang/srt/models/mllama.py +1 -1
  114. sglang/srt/models/phi3_small.py +16 -2
  115. sglang/srt/models/pixtral.py +467 -0
  116. sglang/srt/models/qwen2_5_vl.py +8 -4
  117. sglang/srt/models/qwen2_vl.py +4 -4
  118. sglang/srt/models/roberta.py +1 -1
  119. sglang/srt/models/torch_native_llama.py +1 -1
  120. sglang/srt/models/xiaomi_mimo.py +171 -0
  121. sglang/srt/openai_api/adapter.py +52 -42
  122. sglang/srt/openai_api/protocol.py +20 -16
  123. sglang/srt/reasoning_parser.py +1 -1
  124. sglang/srt/sampling/custom_logit_processor.py +18 -3
  125. sglang/srt/sampling/sampling_batch_info.py +2 -2
  126. sglang/srt/sampling/sampling_params.py +2 -0
  127. sglang/srt/server_args.py +64 -10
  128. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  129. sglang/srt/speculative/eagle_utils.py +7 -7
  130. sglang/srt/speculative/eagle_worker.py +22 -19
  131. sglang/srt/utils.py +41 -6
  132. sglang/test/few_shot_gsm8k.py +2 -2
  133. sglang/test/few_shot_gsm8k_engine.py +2 -2
  134. sglang/test/run_eval.py +2 -2
  135. sglang/test/runners.py +8 -1
  136. sglang/test/send_one.py +13 -3
  137. sglang/test/simple_eval_common.py +1 -1
  138. sglang/test/simple_eval_humaneval.py +1 -1
  139. sglang/test/test_block_fp8.py +2 -2
  140. sglang/test/test_deepep_utils.py +219 -0
  141. sglang/test/test_programs.py +5 -5
  142. sglang/test/test_utils.py +92 -15
  143. sglang/utils.py +1 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
  146. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
  147. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
  148. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  149. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/device_communicators/pynccl.py
@@ -75,7 +75,8 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False

-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())

         if self.rank == 0:
             # get the unique id from NCCL
sglang/srt/distributed/device_communicators/shm_broadcast.py
@@ -225,7 +225,8 @@ class MessageQueue:
             remote_subscribe_port = get_open_port()
             if is_valid_ipv6_address(connect_ip):
                 self.remote_socket.setsockopt(IPV6, 1)
+                connect_ip = f"[{connect_ip}]"
-            socket_addr = f"tcp://*:{remote_subscribe_port}"
+            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
             self.remote_socket.bind(socket_addr)

         else:
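The hunk above binds the remote subscribe socket to the resolved `connect_ip` instead of the wildcard `*`, and wraps IPv6 literals in brackets before building the endpoint string. A minimal sketch of the underlying ZeroMQ behavior (address and port below are arbitrary placeholders, not values from the diff):

import zmq

ctx = zmq.Context()
sock = ctx.socket(zmq.REP)
sock.setsockopt(zmq.IPV6, 1)      # enable IPv6 on the socket, as the diff does
sock.bind("tcp://[::1]:5555")     # IPv6 literals must be bracketed in tcp:// endpoints
# sock.bind("tcp://::1:5555")     # unbracketed form is ambiguous: the port cannot be parsed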
sglang/srt/distributed/parallel_state.py
@@ -42,6 +42,7 @@ from torch.distributed import Backend, ProcessGroup
 from sglang.srt.utils import (
     direct_register_custom_op,
     is_cuda_alike,
+    is_npu,
     supports_custom_op,
 )

@@ -206,6 +207,7 @@ class GroupCoordinator:
         use_custom_allreduce: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
+        use_npu_communicator: bool,
         use_message_queue_broadcaster: bool = False,
         group_name: Optional[str] = None,
     ):
@@ -244,6 +246,7 @@ class GroupCoordinator:
         self.use_custom_allreduce = use_custom_allreduce
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
+        self.use_npu_communicator = use_npu_communicator
         self.use_message_queue_broadcaster = use_message_queue_broadcaster

         # lazy import to avoid documentation build error
@@ -291,6 +294,14 @@ class GroupCoordinator:
         if use_xpu_communicator and self.world_size > 1:
             self.xpu_communicator = XpuCommunicator(group=self.device_group)

+        from sglang.srt.distributed.device_communicators.npu_communicator import (
+            NpuCommunicator,
+        )
+
+        self.npu_communicator: Optional[NpuCommunicator] = None
+        if use_npu_communicator and self.world_size > 1:
+            self.npu_communicator = NpuCommunicator(group=self.device_group)
+
         from sglang.srt.distributed.device_communicators.shm_broadcast import (
             MessageQueue,
         )
@@ -418,6 +429,9 @@ class GroupCoordinator:
         if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
             return self.xpu_communicator.all_reduce(input_)

+        if self.npu_communicator is not None and not self.npu_communicator.disabled:
+            return self.npu_communicator.all_reduce(input_)
+
         if (
             self.ca_comm is not None
             and not self.ca_comm.disabled
@@ -497,6 +511,11 @@ class GroupCoordinator:
         if hpu_comm is not None and not hpu_comm.disabled:
             return hpu_comm.all_gather(input_, dim)

+        # For NPUs, use NPU communicator.
+        npu_comm = self.npu_communicator
+        if npu_comm is not None and not npu_comm.disabled:
+            return npu_comm.all_gather(input_, dim)
+
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
@@ -941,6 +960,7 @@ def init_world_group(
         use_custom_allreduce=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
+        use_npu_communicator=False,
         group_name="world",
     )

@@ -959,10 +979,11 @@ def init_model_parallel_group(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_pynccl=True,
+        use_pynccl=not is_npu(),
         use_custom_allreduce=use_custom_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
+        use_npu_communicator=True,
         use_message_queue_broadcaster=use_message_queue_broadcaster,
         group_name=group_name,
     )
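Taken together, these hunks give `GroupCoordinator` a fourth device-specific backend: collectives are routed to `self.npu_communicator` before falling back to the custom-allreduce/pynccl paths, and `init_model_parallel_group` disables pynccl on NPU hosts. The real implementation lives in the new `sglang/srt/distributed/device_communicators/npu_communicator.py`; the sketch below only illustrates the interface the call sites above assume (a `disabled` flag plus `all_reduce`/`all_gather`), not the actual class:

import torch
import torch.distributed as dist

class NpuCommunicatorSketch:
    """Illustrative shape of the communicator GroupCoordinator dispatches to."""

    def __init__(self, group: dist.ProcessGroup):
        self.group = group
        # GroupCoordinator checks `.disabled` before using this backend.
        self.disabled = False

    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
        dist.all_reduce(input_, group=self.group)
        return input_

    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
        world_size = dist.get_world_size(self.group)
        output = [torch.empty_like(input_) for _ in range(world_size)]
        dist.all_gather(output, input_, group=self.group)
        return torch.cat(output, dim=dim)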
sglang/srt/entrypoints/engine.py
@@ -163,6 +163,9 @@ class Engine(EngineBase):
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         return_hidden_states: bool = False,
         stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
     ) -> Union[Dict, Iterator[Dict]]:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
@@ -181,6 +184,9 @@ class Engine(EngineBase):
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
         )
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
@@ -227,6 +233,9 @@ class Engine(EngineBase):
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
     ) -> Union[Dict, AsyncIterator[Dict]]:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
@@ -244,6 +253,9 @@ class Engine(EngineBase):
             lora_path=lora_path,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)

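Both `generate` and `async_generate` now forward the prefill/decode (PD) disaggregation bootstrap fields straight into `GenerateReqInput`. A hedged usage sketch; the model path, host, port, and room values are placeholders, and their exact semantics come from the disaggregation backend rather than this diff:

import sglang as sgl

llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model

# Attach PD-disaggregation bootstrap info to a single request (illustrative values).
out = llm.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 16},
    bootstrap_host="10.0.0.1",
    bootstrap_port=8998,
    bootstrap_room=12345,
)
print(out["text"])
llm.shutdown()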
@@ -273,6 +285,21 @@ class Engine(EngineBase):
         ret = loop.run_until_complete(generator.__anext__())
         return ret

+    async def async_encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
+    ) -> Dict:
+        """
+        Asynchronous version of encode method.
+
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        return await generator.__anext__()
+
     def shutdown(self):
         """Shutdown the engine"""
         kill_process_tree(os.getpid(), include_parent=False)
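`Engine.async_encode` is the awaitable counterpart of the existing `encode` for embedding requests. A hedged usage sketch; the model name is a placeholder and the output key assumes the standard embedding response dict:

import asyncio
import sglang as sgl

async def main():
    llm = sgl.Engine(model_path="BAAI/bge-m3", is_embedding=True)  # placeholder model
    result = await llm.async_encode("an example sentence to embed")
    print(len(result["embedding"]))  # embedding vector length, assuming the usual output shape
    llm.shutdown()

asyncio.run(main())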
@@ -303,7 +330,7 @@ class Engine(EngineBase):
         return {
             **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
-            **internal_states,
+            "internal_states": internal_states,
             "version": __version__,
         }

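`get_server_info` (and the matching `/get_server_info` route changed below) no longer splats the scheduler's internal state into the top-level dict; it is returned under a dedicated key instead. A hedged sketch of what that means for callers; the field name inside `internal_states` is only an example:

# Before (0.4.6.post2): internal fields were merged into the top-level response,
# so callers could read them directly, e.g. info["last_gen_throughput"].
# After (0.4.6.post4): they live under a dedicated key.
info = llm.get_server_info()
internal = info["internal_states"]   # the object returned by the scheduler's get_internal_state
print(info["version"])               # top-level keys such as "version" are unchanged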
@@ -348,8 +375,8 @@ class Engine(EngineBase):
         load_format: Optional[str] = None,
         flush_cache: bool = True,
     ):
-        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be true
-        to avoid duplicated operations such as clearing cache."""
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
+        to avoid duplicated cache cleaning operation."""
         obj = UpdateWeightsFromTensorReqInput(
             serialized_named_tensors=[
                 MultiprocessingSerializer.serialize(named_tensors)
@@ -459,7 +486,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.1",
+            "0.1.2.post1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

sglang/srt/entrypoints/http_server.py
@@ -42,10 +42,14 @@ from fastapi import FastAPI, File, Form, Request, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import FakeBootstrapHost
+from sglang.srt.disaggregation.utils import (
+    FakeBootstrapHost,
+    register_disaggregation_server,
+)
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
+    AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
@@ -59,6 +63,7 @@ from sglang.srt.managers.io_struct import (
     ResumeMemoryOccupationReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
+    SlowDownReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -217,7 +222,7 @@ async def get_server_info():
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
-        **internal_states,
+        "internal_states": internal_states,
         "version": __version__,
     }

@@ -333,7 +338,11 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
         obj = ProfileReqInput()

     await _global_state.tokenizer_manager.start_profile(
-        obj.output_dir, obj.num_steps, obj.activities
+        output_dir=obj.output_dir,
+        num_steps=obj.num_steps,
+        activities=obj.activities,
+        with_stack=obj.with_stack,
+        record_shapes=obj.record_shapes,
     )
     return Response(
         content="Start profiling.\n",
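`/start_profile` now forwards two extra torch-profiler options, `with_stack` and `record_shapes`, from `ProfileReqInput`. A hedged request sketch; the server URL, output directory, and field values are placeholders:

import requests

resp = requests.post(
    "http://localhost:30000/start_profile",   # placeholder server address
    json={
        "output_dir": "/tmp/sglang_trace",     # placeholder path
        "num_steps": 5,
        "activities": ["CPU", "GPU"],
        "with_stack": True,                    # new in this release
        "record_shapes": True,                 # new in this release
    },
)
print(resp.status_code, resp.text)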
@@ -491,6 +500,19 @@ async def resume_memory_occupation(
         return _create_error_response(e)


+@app.api_route("/slow_down", methods=["GET", "POST"])
+async def slow_down(obj: SlowDownReqInput, request: Request):
+    """Slow down the system deliberately. Only for testing. Example scenario:
+    when we want to test performance of D in large-scale PD disaggregation and have no enough nodes for P,
+    we can use this to slow down D to let it have enough running sequences, and then disable slowdown
+    to let it run in full batch size.
+    """
+    try:
+        await _global_state.tokenizer_manager.slow_down(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.api_route("/open_session", methods=["GET", "POST"])
 async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
@@ -522,6 +544,16 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
     return Response(status_code=200)


+@app.post("/abort_request")
+async def abort_request(obj: AbortReq, request: Request):
+    """Abort a request."""
+    try:
+        _global_state.tokenizer_manager.abort_request(rid=obj.rid)
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.post("/parse_function_call")
 async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
     """
@@ -675,6 +707,8 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
         **(vertex_req.parameters or {}),
     )
     ret = await generate_request(req, raw_request)
+    if isinstance(ret, Response):
+        return ret
     return ORJSONResponse({"predictions": ret})


@@ -869,5 +903,13 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

+    if server_args.pdlb_url is not None:
+        register_disaggregation_server(
+            server_args.disaggregation_mode,
+            server_args.port,
+            server_args.disaggregation_bootstrap_port,
+            server_args.pdlb_url,
+        )
+
     if launch_callback is not None:
         launch_callback()
sglang/srt/entrypoints/verl_engine.py
@@ -37,6 +37,7 @@ class VerlEngine:
         monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
+        self._rank = device_mesh_cpu.get_rank()
         self._tp_size = device_mesh_cpu.size()
         tp_size_per_node = self._tp_size // nnodes
         node_rank = self._tp_rank // tp_size_per_node
@@ -114,7 +115,7 @@ class VerlEngine:
         # Most naive implementation, can extract tensor and send via gloo if too slow
         [output] = broadcast_pyobj(
             data=[output],
-            rank=self._tp_rank,
+            rank=self._rank,
             dist_group=self._device_mesh_cpu.get_group(),
             src=self._device_mesh_cpu.mesh[0].item(),
             force_cpu_device=False,
@@ -157,7 +158,7 @@ class VerlEngine:
         )

         if self._tp_rank == 0:
-            self._engine.tokenizer_manager.flush_cache()
+            self._engine.flush_cache()

     def release_memory_occupation(self):
         if self._tp_rank == 0:
sglang/srt/function_call_parser.py
@@ -86,8 +86,8 @@ class StructureInfo:

 _GetInfoFunc = Callable[[str], StructureInfo]
 """
-helper alias of function
-ususally it is a function that takes a name string and returns a StructureInfo object,
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
 which can be used to construct a structural_tag object
 """

sglang/srt/hf_transformers_utils.py
@@ -19,6 +19,7 @@ import warnings
 from pathlib import Path
 from typing import Dict, Optional, Type, Union

+import transformers
 from huggingface_hub import snapshot_download
 from transformers import (
     AutoConfig,
@@ -26,6 +27,7 @@ from transformers import (
     AutoTokenizer,
     PretrainedConfig,
     PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
     PreTrainedTokenizerFast,
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
@@ -38,6 +40,7 @@ from sglang.srt.configs import (
     KimiVLConfig,
     MultiModalityConfig,
 )
+from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
 from sglang.srt.utils import is_remote_url

@@ -48,6 +51,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
     KimiVLConfig.model_type: KimiVLConfig,
+    InternVLChatConfig.model_type: InternVLChatConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
@@ -90,6 +94,12 @@ def get_config(
         config = config_class.from_pretrained(model, revision=revision)
         # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
         setattr(config, "_name_or_path", model)
+
+    if isinstance(model, str) and config.model_type == "internvl_chat":
+        for key, val in config.llm_config.__dict__.items():
+            if not hasattr(config, key):
+                setattr(config, key, val)
+
     if model_override_args:
         config.update(model_override_args)

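For InternVL checkpoints the language-model settings live in a nested `llm_config`; the new loop in `get_config` copies any of those attributes not already present onto the top-level config so downstream code can keep reading fields like `config.hidden_size` directly. A hedged illustration with a dummy config object (the field names are examples, not taken from a real InternVL config):

from types import SimpleNamespace

config = SimpleNamespace(model_type="internvl_chat")
config.llm_config = SimpleNamespace(hidden_size=4096, num_hidden_layers=32)

# Mirror of the flattening loop in get_config():
for key, val in config.llm_config.__dict__.items():
    if not hasattr(config, key):
        setattr(config, key, val)

print(config.hidden_size)  # 4096, readable without going through llm_config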
@@ -211,6 +221,13 @@ def get_tokenizer(
     return tokenizer


+# Some models doesn't have an available processor, e.g.: InternVL
+def get_tokenizer_from_processor(processor):
+    if isinstance(processor, PreTrainedTokenizerBase):
+        return processor
+    return processor.tokenizer
+
+
 def get_processor(
     tokenizer_name: str,
     *args,
@@ -246,7 +263,9 @@ def get_processor(
         **kwargs,
     )

-    attach_additional_stop_token_ids(processor.tokenizer)
+    tokenizer = get_tokenizer_from_processor(processor)
+
+    attach_additional_stop_token_ids(tokenizer)
     return processor

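`get_processor` previously assumed the object returned by the processor loader always had a `.tokenizer` attribute; for models like InternVL the load can yield a tokenizer directly, so the new helper unwraps either case before attaching extra stop token ids. A hedged standalone sketch of the same unwrapping; the model id is a placeholder:

from transformers import AutoProcessor, PreTrainedTokenizerBase

# Placeholder checkpoint; any model whose processor loading may fall back to a tokenizer applies.
processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL2-8B", trust_remote_code=True)

# Same logic as get_tokenizer_from_processor():
tokenizer = processor if isinstance(processor, PreTrainedTokenizerBase) else processor.tokenizer
print(type(tokenizer).__name__)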