sglang 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +17 -2
- sglang/bench_serving.py +168 -22
- sglang/srt/configs/internvl.py +4 -2
- sglang/srt/configs/janus_pro.py +1 -1
- sglang/srt/configs/model_config.py +49 -0
- sglang/srt/configs/update_config.py +119 -0
- sglang/srt/conversation.py +35 -0
- sglang/srt/custom_op.py +7 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +22 -6
- sglang/srt/disaggregation/mooncake/conn.py +289 -48
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +100 -52
- sglang/srt/disaggregation/prefill.py +5 -4
- sglang/srt/disaggregation/utils.py +13 -12
- sglang/srt/distributed/parallel_state.py +44 -17
- sglang/srt/entrypoints/EngineBase.py +8 -0
- sglang/srt/entrypoints/engine.py +45 -9
- sglang/srt/entrypoints/http_server.py +111 -24
- sglang/srt/entrypoints/openai/protocol.py +51 -6
- sglang/srt/entrypoints/openai/serving_chat.py +52 -76
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/eplb/__init__.py +0 -0
- sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
- sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
- sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
- sglang/srt/{managers → eplb}/expert_distribution.py +18 -1
- sglang/srt/{managers → eplb}/expert_location.py +1 -1
- sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
- sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/activation.py +7 -0
- sglang/srt/layers/amx_utils.py +86 -0
- sglang/srt/layers/attention/ascend_backend.py +219 -0
- sglang/srt/layers/attention/flashattention_backend.py +56 -23
- sglang/srt/layers/attention/tbo_backend.py +37 -9
- sglang/srt/layers/communicator.py +18 -2
- sglang/srt/layers/dp_attention.py +9 -3
- sglang/srt/layers/elementwise.py +76 -12
- sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
- sglang/srt/layers/layernorm.py +41 -0
- sglang/srt/layers/linear.py +99 -12
- sglang/srt/layers/logits_processor.py +15 -6
- sglang/srt/layers/moe/ep_moe/kernels.py +23 -8
- sglang/srt/layers/moe/ep_moe/layer.py +115 -25
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +42 -19
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +129 -10
- sglang/srt/layers/moe/router.py +60 -22
- sglang/srt/layers/moe/topk.py +36 -28
- sglang/srt/layers/parameter.py +67 -7
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
- sglang/srt/layers/quantization/fp8.py +44 -0
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +6 -6
- sglang/srt/layers/quantization/gptq.py +5 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -1
- sglang/srt/layers/quantization/quant_utils.py +166 -0
- sglang/srt/layers/quantization/w8a8_int8.py +52 -1
- sglang/srt/layers/rotary_embedding.py +105 -13
- sglang/srt/layers/vocab_parallel_embedding.py +19 -2
- sglang/srt/lora/lora.py +4 -5
- sglang/srt/lora/lora_manager.py +73 -20
- sglang/srt/managers/configure_logging.py +1 -1
- sglang/srt/managers/io_struct.py +60 -15
- sglang/srt/managers/mm_utils.py +73 -59
- sglang/srt/managers/multimodal_processor.py +2 -6
- sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
- sglang/srt/managers/schedule_batch.py +80 -79
- sglang/srt/managers/scheduler.py +153 -63
- sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
- sglang/srt/managers/session_controller.py +12 -3
- sglang/srt/managers/tokenizer_manager.py +314 -103
- sglang/srt/managers/tp_worker.py +13 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
- sglang/srt/mem_cache/allocator.py +290 -0
- sglang/srt/mem_cache/chunk_cache.py +34 -2
- sglang/srt/mem_cache/memory_pool.py +289 -3
- sglang/srt/mem_cache/multimodal_cache.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +3 -2
- sglang/srt/model_executor/forward_batch_info.py +17 -4
- sglang/srt/model_executor/model_runner.py +302 -58
- sglang/srt/model_loader/loader.py +86 -10
- sglang/srt/model_loader/weight_utils.py +160 -3
- sglang/srt/models/deepseek_nextn.py +5 -4
- sglang/srt/models/deepseek_v2.py +305 -26
- sglang/srt/models/deepseek_vl2.py +3 -5
- sglang/srt/models/gemma3_causal.py +1 -2
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1010 -0
- sglang/srt/models/gemma3n_mm.py +495 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/models/kimi_vl.py +1 -2
- sglang/srt/models/llama.py +10 -4
- sglang/srt/models/llama4.py +32 -45
- sglang/srt/models/llama_eagle3.py +61 -11
- sglang/srt/models/llava.py +5 -5
- sglang/srt/models/minicpmo.py +2 -2
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mllama4.py +43 -11
- sglang/srt/models/phi4mm.py +1 -3
- sglang/srt/models/pixtral.py +3 -7
- sglang/srt/models/qwen2.py +31 -3
- sglang/srt/models/qwen2_5_vl.py +1 -3
- sglang/srt/models/qwen2_audio.py +200 -0
- sglang/srt/models/qwen2_moe.py +32 -6
- sglang/srt/models/qwen2_vl.py +1 -4
- sglang/srt/models/qwen3.py +94 -25
- sglang/srt/models/qwen3_moe.py +68 -21
- sglang/srt/models/vila.py +3 -8
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +150 -133
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
- sglang/srt/multimodal/processors/gemma3n.py +82 -0
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +3 -6
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
- sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
- sglang/srt/operations_strategy.py +6 -2
- sglang/srt/reasoning_parser.py +26 -0
- sglang/srt/sampling/sampling_batch_info.py +39 -1
- sglang/srt/server_args.py +85 -24
- sglang/srt/speculative/build_eagle_tree.py +57 -18
- sglang/srt/speculative/eagle_worker.py +6 -4
- sglang/srt/two_batch_overlap.py +204 -28
- sglang/srt/utils.py +369 -138
- sglang/srt/warmup.py +12 -3
- sglang/test/runners.py +10 -1
- sglang/test/test_utils.py +15 -3
- sglang/version.py +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/METADATA +9 -6
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/RECORD +149 -137
- sglang/math_utils.py +0 -8
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
- /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
- /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
- /sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.9.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
CHANGED
@@ -48,10 +48,12 @@ from sglang.srt.managers.io_struct import (
     GetWeightsByNameReqInput,
     ImageDataItem,
     InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
     RpcReqInput,
     RpcReqOutput,
+    UnloadLoRAAdapterReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -115,13 +117,13 @@ class Engine(EngineBase):
         atexit.register(self.shutdown)
 
         # Allocate ports for inter-process communications
-        port_args = PortArgs.init_new(server_args)
+        self.port_args = PortArgs.init_new(server_args)
         logger.info(f"{server_args=}")
 
         # Launch subprocesses
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
-            port_args=port_args,
+            port_args=self.port_args,
         )
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
@@ -130,7 +132,7 @@ class Engine(EngineBase):
 
         context = zmq.Context(2)
         self.send_to_rpc = get_zmq_socket(
-            context, zmq.DEALER, port_args.rpc_ipc_name, True
+            context, zmq.DEALER, self.port_args.rpc_ipc_name, True
         )
 
     def generate(
@@ -242,6 +244,7 @@ class Engine(EngineBase):
         token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
         stream: bool = False,
         bootstrap_host: Optional[Union[List[str], str]] = None,
         bootstrap_port: Optional[Union[List[int], int]] = None,
@@ -274,6 +277,7 @@ class Engine(EngineBase):
             top_logprobs_num=top_logprobs_num,
             token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
+            return_hidden_states=return_hidden_states,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
             bootstrap_host=bootstrap_host,
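
Engine.generate() gains a return_hidden_states flag that is forwarded into GenerateReqInput. A minimal sketch of how it might be used follows; the model path is a placeholder, and the exact key under which hidden states appear in the output is an assumption, not something shown in this diff.

import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
out = engine.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 8, "temperature": 0.0},
    return_hidden_states=True,  # new keyword plumbed through in 0.4.9
)
# Where the hidden states land in the output dict (e.g. under meta_info) is an
# assumption; inspect the returned structure for your version.
print(out.keys())
engine.shutdown()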
@@ -414,12 +418,21 @@ class Engine(EngineBase):
             self.tokenizer_manager.init_weights_update_group(obj, None)
         )
 
-    def update_weights_from_distributed(
+    def update_weights_from_distributed(
+        self,
+        names: list[str],
+        dtypes: list[str],
+        shapes: list[list[int]],
+        group_name: str = "weight_update_group",
+        flush_cache: bool = True,
+    ):
         """Update weights from distributed source."""
         obj = UpdateWeightsFromDistributedReqInput(
-
-
-
+            names=names,
+            dtypes=dtypes,
+            shapes=shapes,
+            group_name=group_name,
+            flush_cache=flush_cache,
         )
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(
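
The signature is now explicit and batched: parallel lists of tensor names, dtypes, and shapes, plus an optional group name and cache-flush flag. A hedged sketch of a call is below; the tensor names, dtypes, and shapes are illustrative values only, not taken from this diff.

# Assumes a weight-update process group was already set up via
# engine.init_weights_update_group(...); the values below are illustrative.
engine.update_weights_from_distributed(
    names=["model.embed_tokens.weight", "lm_head.weight"],
    dtypes=["bfloat16", "bfloat16"],
    shapes=[[128256, 4096], [128256, 4096]],
    group_name="weight_update_group",  # default from the new signature
    flush_cache=True,                  # default from the new signature
)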
@@ -476,6 +489,29 @@ class Engine(EngineBase):
             self.tokenizer_manager.get_weights_by_name(obj, None)
         )
 
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        """Load a new LoRA adapter without re-launching the engine."""
+
+        obj = LoadLoRAAdapterReqInput(
+            lora_name=lora_name,
+            lora_path=lora_path,
+        )
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.load_lora_adapter(obj, None)
+        )
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload a LoRA adapter without re-launching the engine."""
+
+        obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.unload_lora_adapter(obj, None)
+        )
+
     def release_memory_occupation(self, tags: Optional[List[str]] = None):
         obj = ReleaseMemoryOccupationReqInput(tags=tags)
         loop = asyncio.get_event_loop()
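
With LoadLoRAAdapterReqInput and UnloadLoRAAdapterReqInput wired into the Engine, adapters can be swapped at runtime. A hedged sketch follows; the model and adapter paths are placeholders, and whether the engine must be launched with LoRA support already enabled is not shown in this diff.

import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder

# Register an adapter at runtime, route one request through it, then drop it.
engine.load_lora_adapter(lora_name="sql-adapter", lora_path="/adapters/sql")
out = engine.generate(
    prompt="Write a SQL query that counts users per country.",
    sampling_params={"max_new_tokens": 64},
    lora_path=["sql-adapter"],  # per-request adapter selection via generate()
)
print(out)
engine.unload_lora_adapter(lora_name="sql-adapter")
engine.shutdown()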
@@ -606,7 +642,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.7.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -614,7 +650,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.
+            "0.2.4",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -72,6 +72,7 @@ from sglang.srt.managers.io_struct import (
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
     ParseFunctionCallReq,
     ProfileReqInput,
@@ -80,6 +81,7 @@ from sglang.srt.managers.io_struct import (
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
+    UnloadLoRAAdapterReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
@@ -124,8 +126,6 @@ def set_global_state(global_state: _GlobalState):
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args: ServerArgs = fast_api_app.server_args
-
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -143,9 +143,12 @@ async def lifespan(fast_api_app: FastAPI):
         _global_state.tokenizer_manager
     )
 
+    server_args: ServerArgs = fast_api_app.server_args
     if server_args.warmups is not None:
         await execute_warmups(
-            server_args.
+            server_args.disaggregation_mode,
+            server_args.warmups.split(","),
+            _global_state.tokenizer_manager,
         )
         logger.info("Warmup ended")
 
@@ -278,13 +281,17 @@ async def get_model_info():
         "model_path": _global_state.tokenizer_manager.model_path,
         "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
         "is_generation": _global_state.tokenizer_manager.is_generation,
+        "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
     }
     return result
 
 
 @app.get("/get_server_info")
 async def get_server_info():
-
+    # Returns internal states per DP.
+    internal_states: List[Dict[Any, Any]] = (
+        await _global_state.tokenizer_manager.get_internal_state()
+    )
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
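
/get_model_info now also reports preferred_sampling_params, and /get_server_info gathers per-DP internal state before responding. A small sketch of polling the extended endpoint; the server address is a placeholder.

import requests

info = requests.get("http://localhost:30000/get_model_info").json()
print(info["model_path"], info["is_generation"])
# New field in 0.4.9; None unless preferred sampling params were configured.
print(info.get("preferred_sampling_params"))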
@@ -298,6 +305,8 @@ async def get_load():
     return await _global_state.tokenizer_manager.get_load()
 
 
+# example usage:
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -351,8 +360,7 @@ async def generate_from_file_request(file: UploadFile, request: Request):
     obj = GenerateReqInput(
         input_embeds=input_embeds,
         sampling_params={
-            "
-            "temperature": 0.2,
+            "temperature": 0.0,
             "max_new_tokens": 512,
         },
     )
@@ -391,16 +399,6 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
         return _create_error_response(e)
 
 
-@app.api_route(
-    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
-)
-async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
-    """Endpoint for reranking documents based on query relevance."""
-    return await raw_request.app.state.openai_serving_rerank.handle_request(
-        request, raw_request
-    )
-
-
 @app.api_route("/flush_cache", methods=["GET", "POST"])
 async def flush_cache():
     """Flush the radix cache."""
@@ -595,6 +593,40 @@ async def slow_down(obj: SlowDownReqInput, request: Request):
         return _create_error_response(e)
 
 
+@app.api_route("/load_lora_adapter", methods=["POST"])
+async def load_lora_adapter(obj: LoadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.load_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/unload_lora_adapter", methods=["POST"])
+async def unload_lora_adapter(obj: UnloadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.unload_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
 @app.api_route("/open_session", methods=["GET", "POST"])
 async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
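
The same load/unload operations are exposed over HTTP. A hedged sketch of driving them with requests; the field names mirror how LoadLoRAAdapterReqInput is constructed in engine.py above, and the server address and adapter path are placeholders.

import requests

base = "http://localhost:30000"

r = requests.post(
    f"{base}/load_lora_adapter",
    json={"lora_name": "sql-adapter", "lora_path": "/adapters/sql"},
)
print(r.status_code, r.json())  # 200 on success, 400 with error details otherwise

r = requests.post(f"{base}/unload_lora_adapter", json={"lora_name": "sql-adapter"})
print(r.status_code, r.json())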
@@ -630,7 +662,9 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
 async def abort_request(obj: AbortReq, request: Request):
     """Abort a request."""
     try:
-        _global_state.tokenizer_manager.abort_request(
+        _global_state.tokenizer_manager.abort_request(
+            rid=obj.rid, abort_all=obj.abort_all
+        )
         return Response(status_code=200)
     except Exception as e:
         return _create_error_response(e)
@@ -678,6 +712,26 @@ async def separate_reasoning_request(obj: SeparateReasoningReqInput, request: Re
     return ORJSONResponse(content=response_data, status_code=200)
 
 
+@app.post("/pause_generation")
+async def pause_generation(request: Request):
+    """Pause generation."""
+    await _global_state.tokenizer_manager.pause_generation()
+    return ORJSONResponse(
+        content={"message": "Generation paused successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
+@app.post("/continue_generation")
+async def continue_generation(request: Request):
+    """Continue generation."""
+    await _global_state.tokenizer_manager.continue_generation()
+    return ORJSONResponse(
+        content={"message": "Generation continued successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
 ##### OpenAI-compatible API endpoints #####
 
 
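
A short sketch of the new pause/resume endpoints, for example to bracket a maintenance step such as a weight update; the server address is a placeholder.

import requests

base = "http://localhost:30000"

print(requests.post(f"{base}/pause_generation").json())
# ... perform maintenance, e.g. a weight update ...
print(requests.post(f"{base}/continue_generation").json())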
@@ -805,6 +859,16 @@ async def v1_score_request(request: ScoringRequest, raw_request: Request):
     )
 
 
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
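
/v1/rerank now lives alongside the other OpenAI-compatible routes rather than among the native endpoints. A hedged sketch of a call; the request schema (V1RerankReqInput) is not shown in this diff, so the query/documents field names below are assumptions, and the server address is a placeholder.

import requests

resp = requests.post(
    "http://localhost:30000/v1/rerank",
    json={
        "query": "What is the capital of France?",  # assumed field name
        "documents": [                              # assumed field name
            "Paris is the capital of France.",
            "Berlin is the capital of Germany.",
        ],
    },
)
print(resp.json())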
@@ -851,6 +915,15 @@ def launch_server(
     add_prometheus_middleware(app)
     enable_func_timer()
 
+    image_token_text = None
+    if (
+        tokenizer_manager.image_token_id is not None
+        and not server_args.skip_tokenizer_init
+    ):
+        image_token_text = tokenizer_manager.tokenizer.decode(
+            [tokenizer_manager.image_token_id]
+        )
+
     # Send a warmup request - we will create the thread launch it
     # in the lifespan after all other warmups have fired.
     warmup_thread = threading.Thread(
@@ -858,7 +931,7 @@ def launch_server(
         args=(
             server_args,
             pipe_finish_writer,
-
+            image_token_text,
             launch_callback,
         ),
     )
@@ -881,11 +954,9 @@ def launch_server(
     warmup_thread.join()
 
 
-def
+def _execute_server_warmup(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection],
-    image_token_text: str,
-    launch_callback: Optional[Callable[[], None]] = None,
 ):
     headers = {}
     url = server_args.url()
@@ -910,7 +981,7 @@ def _wait_and_warmup(
             pipe_finish_writer.send(last_traceback)
         logger.error(f"Initialization failed. warmup error: {last_traceback}")
         kill_process_tree(os.getpid())
-        return
+        return success
 
     model_info = res.json()
 
@@ -984,12 +1055,28 @@ def _wait_and_warmup(
             pipe_finish_writer.send(last_traceback)
         logger.error(f"Initialization failed. warmup error: {last_traceback}")
         kill_process_tree(os.getpid())
-        return
+        return False
 
     # Debug print
-    # logger.info(f"{res.json()=}")
+    # logger.info(f"warmup request returns: {res.json()=}")
+    return success
+
+
+def _wait_and_warmup(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection],
+    image_token_text: str,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    if not server_args.skip_server_warmup:
+        if not _execute_server_warmup(
+            server_args,
+            pipe_finish_writer,
+        ):
+            return
 
     logger.info("The server is fired up and ready to roll!")
+
 
     if pipe_finish_writer is not None:
         pipe_finish_writer.send("ready")
sglang/srt/entrypoints/openai/protocol.py
CHANGED
@@ -14,7 +14,8 @@
 """Pydantic models for OpenAI API protocol"""
 
 import time
-from
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import (
     BaseModel,
@@ -195,6 +196,9 @@ class CompletionRequest(BaseModel):
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+
     @field_validator("max_tokens")
     @classmethod
     def validate_max_tokens_positive(cls, v):
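
CompletionRequest (and, further below, ChatCompletionRequest and EmbeddingRequest) now accept a client-supplied request id that may also be a list for batched inputs. A hedged sketch of passing one through the OpenAI-compatible endpoint; whether and how the rid is echoed back is not shown in this diff, and the server address is a placeholder.

import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",
    json={
        "model": "default",
        "prompt": "Hello, my name is",
        "max_tokens": 16,
        "rid": "trace-0001",  # new optional field; a list is also accepted
    },
)
print(resp.json())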
@@ -232,7 +236,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
     matched_stop: Union[None, int, str] = None
     hidden_states: Optional[object] = None
 
@@ -309,6 +313,18 @@ class ChatCompletionMessageGenericParam(BaseModel):
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
+    @field_validator("role", mode="before")
+    @classmethod
+    def _normalize_role(cls, v):
+        if isinstance(v, str):
+            v_lower = v.lower()
+            if v_lower not in {"system", "assistant", "tool"}:
+                raise ValueError(
+                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
+                )
+            return v_lower
+        raise ValueError("'role' must be a string")
+
 
 class ChatCompletionMessageUserParam(BaseModel):
     role: Literal["user"]
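
The new validator lower-cases the role of non-user messages and rejects anything outside system/assistant/tool. A self-contained sketch that mirrors the behavior on a toy model (it does not import sglang's class):

from pydantic import BaseModel, field_validator


class GenericMessage(BaseModel):
    role: str
    content: str

    @field_validator("role", mode="before")
    @classmethod
    def _normalize_role(cls, v):
        # Same rule as the validator added above: case-insensitive match,
        # normalized to lower case.
        if isinstance(v, str):
            v_lower = v.lower()
            if v_lower not in {"system", "assistant", "tool"}:
                raise ValueError(
                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
                )
            return v_lower
        raise ValueError("'role' must be a string")


print(GenericMessage(role="Assistant", content="hi").role)  # -> "assistant"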
@@ -429,8 +445,8 @@ class ChatCompletionRequest(BaseModel):
     stream_reasoning: bool = True
     chat_template_kwargs: Optional[Dict] = None
 
-    #
-    rid: Optional[str] = None
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
@@ -494,7 +510,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: Optional[
-        Literal[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
     ] = None
     matched_stop: Union[None, int, str] = None
 
@@ -528,7 +546,7 @@ class EmbeddingRequest(BaseModel):
     user: Optional[str] = None
 
     # The request id.
-    rid: Optional[str] = None
+    rid: Optional[Union[List[str], str]] = None
 
 
 class EmbeddingObject(BaseModel):
@@ -587,3 +605,30 @@ OpenAIServingRequest = Union[
     ScoringRequest,
     V1RerankReqInput,
 ]
+
+
+@dataclass
+class MessageProcessingResult:
+    """Result of processing chat messages and applying templates.
+
+    This dataclass encapsulates all the outputs from message processing including
+    prompt generation, multimodal data extraction, and constraint preparation.
+    Used internally by OpenAIServingChat to pass processed data between methods.
+
+    Args:
+        prompt: The final text prompt after applying chat template
+        prompt_ids: Either the text prompt (str) or tokenized IDs (List[int])
+        image_data: Extracted image data from messages, if any
+        audio_data: Extracted audio data from messages, if any
+        modalities: List of modality types present in the messages
+        stop: Combined stop strings from template and request
+        tool_call_constraint: Optional constraint for structured tool calls
+    """
+
+    prompt: str
+    prompt_ids: Union[str, List[int]]
+    image_data: Optional[Any]
+    audio_data: Optional[Any]
+    modalities: List[str]
+    stop: List[str]
+    tool_call_constraint: Optional[Any] = None
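
For orientation, a hedged sketch of how this internal dataclass might be populated after chat-message processing; the values below are illustrative only and do not come from this diff.

result = MessageProcessingResult(
    prompt="<|user|>Describe this image.<|assistant|>",  # illustrative template output
    prompt_ids=[101, 2023, 2003],                        # or the raw prompt string
    image_data=["data:image/png;base64,..."],
    audio_data=None,
    modalities=["image"],
    stop=["<|user|>"],
    tool_call_constraint=None,
)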