sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/distributed/device_communicators/pynccl.py
CHANGED
```diff
@@ -75,7 +75,8 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
             # get the unique id from NCCL
```
sglang/srt/distributed/device_communicators/shm_broadcast.py
CHANGED
```diff
@@ -225,7 +225,8 @@ class MessageQueue:
             remote_subscribe_port = get_open_port()
             if is_valid_ipv6_address(connect_ip):
                 self.remote_socket.setsockopt(IPV6, 1)
-            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
+                connect_ip = f"[{connect_ip}]"
+            socket_addr = f"tcp://{connect_ip}:{remote_subscribe_port}"
             self.remote_socket.bind(socket_addr)
 
         else:
```
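For context: ZMQ `tcp://` endpoints require IPv6 literals to be bracketed, which is what the added line does. A minimal sketch of the same formatting rule, using the standard `ipaddress` module in place of sglang's `is_valid_ipv6_address` helper:

```python
import ipaddress

def format_tcp_addr(connect_ip: str, port: int) -> str:
    """Bracket IPv6 literals as required by ZMQ tcp:// endpoints."""
    try:
        is_ipv6 = ipaddress.ip_address(connect_ip).version == 6
    except ValueError:
        is_ipv6 = False  # hostnames pass through unchanged
    if is_ipv6:
        connect_ip = f"[{connect_ip}]"
    return f"tcp://{connect_ip}:{port}"

print(format_tcp_addr("::1", 5555))        # tcp://[::1]:5555
print(format_tcp_addr("127.0.0.1", 5555))  # tcp://127.0.0.1:5555
```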
sglang/srt/distributed/parallel_state.py
CHANGED
```diff
@@ -42,6 +42,7 @@ from torch.distributed import Backend, ProcessGroup
 from sglang.srt.utils import (
     direct_register_custom_op,
     is_cuda_alike,
+    is_npu,
     supports_custom_op,
 )
 
```
```diff
@@ -206,6 +207,7 @@ class GroupCoordinator:
         use_custom_allreduce: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
+        use_npu_communicator: bool,
         use_message_queue_broadcaster: bool = False,
         group_name: Optional[str] = None,
     ):
```
```diff
@@ -244,6 +246,7 @@ class GroupCoordinator:
         self.use_custom_allreduce = use_custom_allreduce
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
+        self.use_npu_communicator = use_npu_communicator
         self.use_message_queue_broadcaster = use_message_queue_broadcaster
 
         # lazy import to avoid documentation build error
```
```diff
@@ -291,6 +294,14 @@ class GroupCoordinator:
         if use_xpu_communicator and self.world_size > 1:
             self.xpu_communicator = XpuCommunicator(group=self.device_group)
 
+        from sglang.srt.distributed.device_communicators.npu_communicator import (
+            NpuCommunicator,
+        )
+
+        self.npu_communicator: Optional[NpuCommunicator] = None
+        if use_npu_communicator and self.world_size > 1:
+            self.npu_communicator = NpuCommunicator(group=self.device_group)
+
         from sglang.srt.distributed.device_communicators.shm_broadcast import (
             MessageQueue,
         )
```
```diff
@@ -418,6 +429,9 @@ class GroupCoordinator:
         if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
             return self.xpu_communicator.all_reduce(input_)
 
+        if self.npu_communicator is not None and not self.npu_communicator.disabled:
+            return self.npu_communicator.all_reduce(input_)
+
         if (
             self.ca_comm is not None
             and not self.ca_comm.disabled
```
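Taken together, these hunks slot the NPU communicator into the existing device-communicator dispatch chain: each collective first tries the device-specific backends, and a missing or disabled communicator falls through to the next one. A condensed sketch of the pattern (not the full method; the custom-allreduce/pynccl fallback is elided):

```python
# Condensed sketch of the GroupCoordinator.all_reduce dispatch order after
# this change; the real method continues with custom all-reduce, pynccl,
# and torch.distributed fallbacks.
def all_reduce(self, input_):
    if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
        return self.xpu_communicator.all_reduce(input_)
    if self.npu_communicator is not None and not self.npu_communicator.disabled:
        return self.npu_communicator.all_reduce(input_)
    ...  # custom all-reduce / pynccl / torch.distributed fall-through
```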
```diff
@@ -497,6 +511,11 @@ class GroupCoordinator:
         if hpu_comm is not None and not hpu_comm.disabled:
             return hpu_comm.all_gather(input_, dim)
 
+        # For NPUs, use NPU communicator.
+        npu_comm = self.npu_communicator
+        if npu_comm is not None and not npu_comm.disabled:
+            return npu_comm.all_gather(input_, dim)
+
         if dim < 0:
             # Convert negative dim to positive.
             dim += input_.dim()
```
```diff
@@ -941,6 +960,7 @@ def init_world_group(
         use_custom_allreduce=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
+        use_npu_communicator=False,
         group_name="world",
     )
 
```
```diff
@@ -959,10 +979,11 @@ def init_model_parallel_group(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
-        use_pynccl=True,
+        use_pynccl=not is_npu(),
         use_custom_allreduce=use_custom_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
+        use_npu_communicator=True,
         use_message_queue_broadcaster=use_message_queue_broadcaster,
         group_name=group_name,
     )
```
sglang/srt/entrypoints/engine.py
CHANGED
```diff
@@ -163,6 +163,9 @@ class Engine(EngineBase):
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         return_hidden_states: bool = False,
         stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
     ) -> Union[Dict, Iterator[Dict]]:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
```
```diff
@@ -181,6 +184,9 @@ class Engine(EngineBase):
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
         )
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
```
```diff
@@ -227,6 +233,9 @@ class Engine(EngineBase):
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
     ) -> Union[Dict, AsyncIterator[Dict]]:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
```
```diff
@@ -244,6 +253,9 @@ class Engine(EngineBase):
             lora_path=lora_path,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
 
```
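The new `bootstrap_host`/`bootstrap_port`/`bootstrap_room` parameters are forwarded verbatim into `GenerateReqInput`, so offline `Engine` users can drive prefill/decode (PD) disaggregation per request. A hedged usage sketch; the model path and bootstrap values below are placeholders:

```python
import sglang as sgl

# Placeholder model and bootstrap values; bootstrap_* are only meaningful
# when running prefill/decode disaggregation with a bootstrap server.
engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
out = engine.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 8},
    bootstrap_host="10.0.0.1",
    bootstrap_port=8998,
    bootstrap_room=42,  # room id pairing a prefill request with its decode side
)
print(out["text"])
```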
```diff
@@ -273,6 +285,21 @@ class Engine(EngineBase):
         ret = loop.run_until_complete(generator.__anext__())
         return ret
 
+    async def async_encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
+    ) -> Dict:
+        """
+        Asynchronous version of encode method.
+
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        return await generator.__anext__()
+
     def shutdown(self):
         """Shutdown the engine"""
         kill_process_tree(os.getpid(), include_parent=False)
```
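A minimal sketch of the new `async_encode`, assuming an embedding model is loaded (the model id below is a placeholder):

```python
import asyncio
import sglang as sgl

async def main():
    # Placeholder embedding model; is_embedding=True serves encode requests.
    engine = sgl.Engine(
        model_path="intfloat/e5-mistral-7b-instruct", is_embedding=True
    )
    result = await engine.async_encode("The quick brown fox")
    print(len(result["embedding"]))
    engine.shutdown()

asyncio.run(main())
```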
```diff
@@ -303,7 +330,7 @@ class Engine(EngineBase):
         return {
             **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
-            **internal_states,
+            "internal_states": internal_states,
             "version": __version__,
         }
 
```
```diff
@@ -348,8 +375,8 @@ class Engine(EngineBase):
         load_format: Optional[str] = None,
         flush_cache: bool = True,
     ):
-        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be
-        to avoid duplicated
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
+        to avoid duplicated cache cleaning operation."""
         obj = UpdateWeightsFromTensorReqInput(
             serialized_named_tensors=[
                 MultiprocessingSerializer.serialize(named_tensors)
```
```diff
@@ -459,7 +486,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.
+            "0.1.2.post1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
```
sglang/srt/entrypoints/http_server.py
CHANGED
```diff
@@ -42,10 +42,14 @@ from fastapi import FastAPI, File, Form, Request, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import FakeBootstrapHost
+from sglang.srt.disaggregation.utils import (
+    FakeBootstrapHost,
+    register_disaggregation_server,
+)
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
+    AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
```
```diff
@@ -59,6 +63,7 @@ from sglang.srt.managers.io_struct import (
     ResumeMemoryOccupationReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
+    SlowDownReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
```
```diff
@@ -217,7 +222,7 @@ async def get_server_info():
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
-        **internal_states,
+        "internal_states": internal_states,
         "version": __version__,
     }
 
```
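Because the internal states are now returned under a dedicated `internal_states` key instead of being spread into the top-level dict, clients reading `/get_server_info` should index accordingly; a short example against a local server (default port assumed):

```python
import requests

info = requests.get("http://localhost:30000/get_server_info").json()
print(info["version"])
# Previously these fields were merged into the top level; now they are
# grouped under a single key.
print(info["internal_states"])
```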
```diff
@@ -333,7 +338,11 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
         obj = ProfileReqInput()
 
     await _global_state.tokenizer_manager.start_profile(
-        obj.output_dir,
+        output_dir=obj.output_dir,
+        num_steps=obj.num_steps,
+        activities=obj.activities,
+        with_stack=obj.with_stack,
+        record_shapes=obj.record_shapes,
     )
     return Response(
         content="Start profiling.\n",
```
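With the endpoint now forwarding all `ProfileReqInput` fields, a profiling request can control the torch profiler options directly. An illustrative request (values are arbitrary):

```python
import requests

requests.post(
    "http://localhost:30000/start_profile",
    json={
        "output_dir": "/tmp/sglang_trace",  # where traces are written
        "num_steps": 5,                     # stop automatically after 5 steps
        "activities": ["CPU", "GPU"],
        "with_stack": True,                 # record Python call stacks
        "record_shapes": True,              # record tensor shapes
    },
)
```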
```diff
@@ -491,6 +500,19 @@ async def resume_memory_occupation(
         return _create_error_response(e)
 
 
+@app.api_route("/slow_down", methods=["GET", "POST"])
+async def slow_down(obj: SlowDownReqInput, request: Request):
+    """Slow down the system deliberately. Only for testing. Example scenario:
+    when we want to test performance of D in large-scale PD disaggregation and have no enough nodes for P,
+    we can use this to slow down D to let it have enough running sequences, and then disable slowdown
+    to let it run in full batch size.
+    """
+    try:
+        await _global_state.tokenizer_manager.slow_down(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.api_route("/open_session", methods=["GET", "POST"])
 async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
```
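A hedged example of exercising the new testing endpoint. The fields of `SlowDownReqInput` are not visible in this diff, so the `forward_sleep_time` payload below is an assumption about its schema:

```python
import requests

BASE = "http://localhost:30000"

# Assumed schema: a per-forward-pass sleep in seconds enables the slowdown...
requests.post(f"{BASE}/slow_down", json={"forward_sleep_time": 0.5})

# ...and a null value presumably disables it again.
requests.post(f"{BASE}/slow_down", json={"forward_sleep_time": None})
```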
```diff
@@ -522,6 +544,16 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
     return Response(status_code=200)
 
 
+@app.post("/abort_request")
+async def abort_request(obj: AbortReq, request: Request):
+    """Abort a request."""
+    try:
+        _global_state.tokenizer_manager.abort_request(rid=obj.rid)
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
 @app.post("/parse_function_call")
 async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
     """
```
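Usage is straightforward: post the request id (`rid`) that was assigned when the generation request was submitted:

```python
import requests

# Abort an in-flight generation by its request id.
requests.post("http://localhost:30000/abort_request", json={"rid": "example-rid"})
```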
```diff
@@ -675,6 +707,8 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
         **(vertex_req.parameters or {}),
     )
     ret = await generate_request(req, raw_request)
+    if isinstance(ret, Response):
+        return ret
     return ORJSONResponse({"predictions": ret})
 
 
```
```diff
@@ -869,5 +903,13 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
+    if server_args.pdlb_url is not None:
+        register_disaggregation_server(
+            server_args.disaggregation_mode,
+            server_args.port,
+            server_args.disaggregation_bootstrap_port,
+            server_args.pdlb_url,
+        )
+
     if launch_callback is not None:
         launch_callback()
```
sglang/srt/entrypoints/verl_engine.py
CHANGED
```diff
@@ -37,6 +37,7 @@ class VerlEngine:
         monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
+        self._rank = device_mesh_cpu.get_rank()
         self._tp_size = device_mesh_cpu.size()
         tp_size_per_node = self._tp_size // nnodes
         node_rank = self._tp_rank // tp_size_per_node
```
```diff
@@ -114,7 +115,7 @@ class VerlEngine:
         # Most naive implementation, can extract tensor and send via gloo if too slow
         [output] = broadcast_pyobj(
             data=[output],
-            rank=self._tp_rank,
+            rank=self._rank,
             dist_group=self._device_mesh_cpu.get_group(),
             src=self._device_mesh_cpu.mesh[0].item(),
             force_cpu_device=False,
```
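The fix distinguishes the mesh-local rank from the global rank: `get_local_rank()` is the position within the mesh dimension (the TP rank), while `broadcast_pyobj` over the whole CPU group needs `get_rank()`, the global rank. A runnable sketch of the distinction (launch with `torchrun --nproc_per_node=2`; on a single-node 1-D mesh the two coincide, but they diverge for multi-node sub-meshes, which is where the original code broke):

```python
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))

# get_local_rank(): rank within the mesh dimension (what self._tp_rank stores)
# get_rank():       global rank in the default group (the new self._rank)
print(f"local={mesh.get_local_rank()} global={mesh.get_rank()}")
```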
```diff
@@ -157,7 +158,7 @@ class VerlEngine:
         )
 
         if self._tp_rank == 0:
-            self._engine.
+            self._engine.flush_cache()
 
     def release_memory_occupation(self):
         if self._tp_rank == 0:
```
sglang/srt/function_call_parser.py
CHANGED
```diff
@@ -86,8 +86,8 @@ class StructureInfo:
 
 _GetInfoFunc = Callable[[str], StructureInfo]
 """
-
-
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
 which can be used to construct a structural_tag object
 """
 
```
sglang/srt/hf_transformers_utils.py
CHANGED
```diff
@@ -19,6 +19,7 @@ import warnings
 from pathlib import Path
 from typing import Dict, Optional, Type, Union
 
+import transformers
 from huggingface_hub import snapshot_download
 from transformers import (
     AutoConfig,
```
```diff
@@ -26,6 +27,7 @@ from transformers import (
     AutoTokenizer,
     PretrainedConfig,
     PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
     PreTrainedTokenizerFast,
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
```
```diff
@@ -38,6 +40,7 @@ from sglang.srt.configs import (
     KimiVLConfig,
     MultiModalityConfig,
 )
+from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
 from sglang.srt.utils import is_remote_url
 
```
```diff
@@ -48,6 +51,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
     KimiVLConfig.model_type: KimiVLConfig,
+    InternVLChatConfig.model_type: InternVLChatConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
```
```diff
@@ -90,6 +94,12 @@ def get_config(
         config = config_class.from_pretrained(model, revision=revision)
         # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
         setattr(config, "_name_or_path", model)
+
+    if isinstance(model, str) and config.model_type == "internvl_chat":
+        for key, val in config.llm_config.__dict__.items():
+            if not hasattr(config, key):
+                setattr(config, key, val)
+
     if model_override_args:
         config.update(model_override_args)
 
```
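The new block flattens InternVL's nested `llm_config` onto the top-level config so downstream code that reads flat attributes (e.g. `num_hidden_layers`) keeps working. A toy illustration of the same flattening, with `SimpleNamespace` standing in for the real HF config objects:

```python
from types import SimpleNamespace

llm_config = SimpleNamespace(num_hidden_layers=32, hidden_size=4096)
config = SimpleNamespace(model_type="internvl_chat", llm_config=llm_config)

# Copy nested keys up, never overwriting existing top-level attributes.
for key, val in config.llm_config.__dict__.items():
    if not hasattr(config, key):
        setattr(config, key, val)

print(config.num_hidden_layers)  # 32, now readable without going via llm_config
```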
```diff
@@ -211,6 +221,13 @@ def get_tokenizer(
     return tokenizer
 
 
+# Some models doesn't have an available processor, e.g.: InternVL
+def get_tokenizer_from_processor(processor):
+    if isinstance(processor, PreTrainedTokenizerBase):
+        return processor
+    return processor.tokenizer
+
+
 def get_processor(
     tokenizer_name: str,
     *args,
```
```diff
@@ -246,7 +263,9 @@ def get_processor(
         **kwargs,
     )
 
-    attach_additional_stop_token_ids(processor.tokenizer)
+    tokenizer = get_tokenizer_from_processor(processor)
+
+    attach_additional_stop_token_ids(tokenizer)
     return processor
 
 
```
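A small sketch of why the `get_tokenizer_from_processor` helper exists: `AutoProcessor` may hand back a bare tokenizer for models without a dedicated processor (the InternVL case noted in the comment above), and a wrapping processor otherwise; the helper normalizes both. The model id below is a placeholder:

```python
from transformers import AutoProcessor, PreTrainedTokenizerBase

def get_tokenizer_from_processor(processor):
    # Same logic as the new helper above.
    if isinstance(processor, PreTrainedTokenizerBase):
        return processor
    return processor.tokenizer

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
tokenizer = get_tokenizer_from_processor(processor)
print(type(tokenizer).__name__)  # a tokenizer either way
```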