sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -7
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +25 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -2
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +29 -4
- sglang/srt/entrypoints/http_server.py +76 -0
- sglang/srt/entrypoints/openai/protocol.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +23 -6
- sglang/srt/entrypoints/openai/serving_completions.py +10 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +14 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
- sglang/srt/layers/attention/triton_backend.py +109 -73
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
- sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +58 -10
- sglang/srt/layers/dp_attention.py +137 -27
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +16 -18
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +18 -46
- sglang/srt/layers/quantization/awq.py +22 -23
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +17 -21
- sglang/srt/layers/quantization/marlin_utils.py +26 -8
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +217 -98
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +222 -39
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +77 -2
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/layers.py +6 -2
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +80 -19
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +23 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +22 -48
- sglang/srt/managers/scheduler.py +28 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +88 -39
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +10 -157
- sglang/srt/mem_cache/allocator_ascend.py +147 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +33 -33
- sglang/srt/model_executor/forward_batch_info.py +11 -10
- sglang/srt/model_executor/model_runner.py +93 -78
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +5 -2
- sglang/srt/models/deepseek_v2.py +226 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +27 -65
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +41 -76
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama.py +10 -2
- sglang/srt/models/llama4.py +18 -7
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +23 -23
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +84 -0
- sglang/srt/models/qwen3_moe.py +27 -43
- sglang/srt/models/step3_vl.py +8 -3
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +22 -2
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +264 -105
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +20 -19
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
- sglang/srt/layers/quantization/fp4.py +0 -557
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py

@@ -88,6 +88,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
+    UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.template_manager import TemplateManager

@@ -342,10 +343,19 @@ async def get_model_info():
         "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
         "is_generation": _global_state.tokenizer_manager.is_generation,
         "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
     }
     return result


+@app.get("/get_weight_version")
+async def get_weight_version():
+    """Get the current weight version."""
+    return {
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version
+    }
+
+
 @app.get("/get_server_info")
 async def get_server_info():
     # Returns internal states per DP.

@@ -501,6 +511,18 @@ async def stop_profile_async():
     )


+@app.api_route("/freeze_gc", methods=["GET", "POST"])
+async def freeze_gc_async():
+    """
+    See engine.freeze_gc for more details.
+    """
+    await _global_state.tokenizer_manager.freeze_gc()
+    return Response(
+        content="Garbage collection frozen.\n",
+        status_code=200,
+    )
+
+
 @app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
 async def start_expert_distribution_record_async():
     """Start recording the expert distribution. Clear the previous record if any."""

@@ -537,6 +559,12 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     success, message, num_paused_requests = (
         await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {
         "success": success,
         "message": message,

@@ -583,6 +611,12 @@ async def update_weights_from_tensor(
     success, message = await _global_state.tokenizer_manager.update_weights_from_tensor(
         obj, request
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {"success": success, "message": message}
     return ORJSONResponse(
         content, status_code=200 if success else HTTPStatus.BAD_REQUEST

@@ -599,6 +633,12 @@ async def update_weights_from_distributed(
             obj, request
         )
     )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
     content = {"success": success, "message": message}
     if success:
         return ORJSONResponse(content, status_code=200)

@@ -606,6 +646,36 @@ async def update_weights_from_distributed(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)


+@app.post("/update_weight_version")
+async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
+    """Update the weight version. This operation requires no active requests."""
+    if obj.abort_all_requests:
+        _global_state.tokenizer_manager.abort_request(abort_all=True)
+
+    # Use a simple approach without the complex lock mechanism for now
+    # since weight_version update is a simple operation that doesn't affect model weights
+    try:
+        # Update the weight version in server args (the single source of truth)
+        _global_state.tokenizer_manager.server_args.weight_version = obj.new_version
+
+        return ORJSONResponse(
+            {
+                "success": True,
+                "message": f"Weight version updated to {obj.new_version}",
+                "new_version": obj.new_version,
+            },
+            status_code=HTTPStatus.OK,
+        )
+    except Exception as e:
+        return ORJSONResponse(
+            {
+                "success": False,
+                "message": f"Failed to update weight version: {str(e)}",
+            },
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
 @app.api_route("/get_weights_by_name", methods=["GET", "POST"])
 async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
     """Get model parameter by name."""

@@ -966,6 +1036,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
     return ORJSONResponse({"predictions": ret})


+def _update_weight_version_if_provided(weight_version: Optional[str]) -> None:
+    """Update weight version if provided."""
+    if weight_version is not None:
+        _global_state.tokenizer_manager.server_args.weight_version = weight_version
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
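The hunks above introduce a weight-version concept in the HTTP server: /get_model_info now reports it, /get_weight_version reads it, /update_weight_version sets it, /freeze_gc is a new utility route, and the three weight-update endpoints can bump the version as part of a successful update. A minimal client-side sketch, assuming a server already listening at a placeholder address and using the field names visible in the handlers:

```python
# Sketch only: the base URL is a placeholder; field names (new_version,
# abort_all_requests, weight_version) come from the handlers shown above.
import requests

BASE_URL = "http://localhost:30000"

# Read the current weight version (also included in /get_model_info).
print(requests.get(f"{BASE_URL}/get_weight_version").json())

# Set a new version label without reloading any weights.
resp = requests.post(
    f"{BASE_URL}/update_weight_version",
    json={"new_version": "v2", "abort_all_requests": False},
)
print(resp.json())  # expected shape: {"success": true, "message": "...", "new_version": "v2"}

# Freeze the Python garbage collector (see engine.freeze_gc).
requests.post(f"{BASE_URL}/freeze_gc")
```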
sglang/srt/entrypoints/openai/protocol.py

@@ -240,6 +240,7 @@ class CompletionResponse(BaseModel):
     model: str
     choices: List[CompletionResponseChoice]
     usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None


 class CompletionResponseStreamChoice(BaseModel):

@@ -517,6 +518,7 @@ class ChatCompletionResponse(BaseModel):
     model: str
     choices: List[ChatCompletionResponseChoice]
     usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None


 class DeltaMessage(BaseModel):

@@ -735,8 +737,8 @@ class ResponsesRequest(BaseModel):
         else:
             max_tokens = default_max_tokens

-        # Avoid exceed the context length by minus
-        max_tokens -=
+        # Avoid exceed the context length by minus 2 token
+        max_tokens -= 2

         # Get parameters with defaults
         temperature = self.temperature
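The new optional metadata field on CompletionResponse and ChatCompletionResponse is how the weight version reaches OpenAI-compatible clients; the serving hunks further below populate it. A sketch of reading it from a raw response, with a placeholder server address and model name:

```python
# Sketch only: URL and model name are placeholders; `metadata` is the
# Optional[Dict[str, Any]] field added to the response models above.
import requests

BASE_URL = "http://localhost:30000"

resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={"model": "my-model", "messages": [{"role": "user", "content": "Hi"}]},
).json()

metadata = resp.get("metadata") or {}
print(metadata.get("weight_version"))
```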
sglang/srt/entrypoints/openai/serving_chat.py

@@ -81,12 +81,25 @@ class OpenAIServingChat(OpenAIServingBase):
                 f"This model supports at most {server_context_length} completion tokens."
             )

+        if request.response_format and request.response_format.type == "json_schema":
+            schema = getattr(request.response_format.json_schema, "schema_", None)
+            if schema is None:
+                return "schema_ is required for json_schema response format request."
+
         return None

     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
+        reasoning_effort = (
+            request.chat_template_kwargs.pop("reasoning_effort", None)
+            if request.chat_template_kwargs
+            else None
+        )
+        if reasoning_effort is not None:
+            request.reasoning_effort = reasoning_effort
+
         """Convert OpenAI chat completion request to internal format"""
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal

@@ -723,6 +736,7 @@ class OpenAIServingChat(OpenAIServingBase):
             model=request.model,
             choices=choices,
             usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
         )

     def _process_logprobs_tokens(

@@ -858,12 +872,15 @@ class OpenAIServingChat(OpenAIServingBase):
         Returns:
             The boolean value of 'enable_thinking' if found, otherwise False.
         """
-        if (
-
-
-
-
-
+        if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
+            # For Qwen3 models, `enable_thinking` is supported.
+            if request.chat_template_kwargs.get("enable_thinking") is not None:
+                return request.chat_template_kwargs.get("enable_thinking")
+            # For DeepSeek-V3.1 models, `thinking` is supported.
+            elif request.chat_template_kwargs.get("thinking") is not None:
+                return request.chat_template_kwargs.get("thinking")
+            else:
+                return False
         return False

     async def _process_tool_call_stream(
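The serving_chat changes touch request handling in three places: json_schema response formats are rejected when no schema is supplied, reasoning_effort is popped out of chat_template_kwargs and promoted to a top-level request attribute, and enable_thinking (Qwen3) or thinking (DeepSeek-V3.1) is resolved from chat_template_kwargs. A request payload sketch that exercises all three paths; the URL, model name, and schema body are placeholders:

```python
# Sketch only: placeholder URL/model; keys mirror the handlers shown above.
import requests

BASE_URL = "http://localhost:30000"

payload = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "Summarize this as JSON."}],
    # `reasoning_effort` is popped and set on the request; `enable_thinking`
    # (or `thinking` for DeepSeek-V3.1) is read by the resolver above.
    "chat_template_kwargs": {"reasoning_effort": "high", "enable_thinking": True},
    # A json_schema response format must carry a schema, otherwise
    # _validate_request returns an error string.
    "response_format": {
        "type": "json_schema",
        "json_schema": {"name": "summary", "schema": {"type": "object"}},
    },
}
print(requests.post(f"{BASE_URL}/v1/chat/completions", json=payload).json())
```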
sglang/srt/entrypoints/openai/serving_completions.py

@@ -1,6 +1,6 @@
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

@@ -41,6 +41,14 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _request_id_prefix(self) -> str:
         return "cmpl-"

+    def _validate_request(self, request: CompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        prompt = request.prompt
+        if not prompt or (isinstance(prompt, list) and all(not p for p in prompt)):
+            return "Prompt cannot be empty"
+
+        return None
+
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,

@@ -373,6 +381,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             created=created,
             choices=choices,
             usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
         )

     def _get_echo_text(self, request: CompletionRequest, index: int) -> str:
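The completions handler gains the same metadata population plus a _validate_request hook that rejects empty prompts before they reach the engine. A quick sketch of the rejection path, with placeholder URL and model name:

```python
# Sketch only: placeholder URL/model. An empty prompt (or a list of empty
# prompts) is now rejected by _validate_request shown above.
import requests

BASE_URL = "http://localhost:30000"

resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"model": "my-model", "prompt": ""},
)
print(resp.status_code, resp.json())  # expected: an error such as "Prompt cannot be empty"
```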
sglang/srt/entrypoints/openai/serving_responses.py

@@ -944,7 +944,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                         type="output_text",
                         text="",
                         annotations=[],
-                        logprobs=
+                        logprobs=None,
                     ),
                 )
             )

@@ -992,7 +992,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                         type="output_text",
                         text="",
                         annotations=[],
-                        logprobs=
+                        logprobs=None,
                     ),
                 )
             )
sglang/srt/eplb/expert_distribution.py

@@ -25,7 +25,6 @@ import torch
 import torch.distributed

 from sglang.srt.eplb.expert_location import ExpertLocationMetadata
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import Withable, get_bool_env_var

@@ -288,14 +287,14 @@ class _SinglePassGatherer(ABC):
             )

         if server_args.expert_distribution_recorder_mode == "stat_approx":
-            if server_args.moe_a2a_backend
+            if server_args.moe_a2a_backend != "none" and (
                 server_args.deepep_mode == "normal"
             ):
                 return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
             else:
                 raise NotImplementedError

-        if server_args.moe_a2a_backend
+        if server_args.moe_a2a_backend != "none":
             if server_args.deepep_mode == "normal":
                 return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":
sglang/srt/function_call/deepseekv3_detector.py

@@ -215,6 +215,6 @@ class DeepSeekV3Detector(BaseFormatDetector):
             sequence_start_token=self.bot_token,
             sequence_end_token=self.eot_token,
             tool_call_separator="",
-            call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n"
+            call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n"{arguments_rule}"\\n```<|tool▁call▁end|>"',
             function_format="json",
         )
sglang/srt/hf_transformers_utils.py

@@ -129,6 +129,25 @@ def get_config(
         config = AutoConfig.from_pretrained(
             model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
         )
+    if (
+        config.architectures is not None
+        and config.architectures[0] == "Phi4MMForCausalLM"
+    ):
+        # Phi4MMForCausalLM uses a hard-coded vision_config. See:
+        # https://github.com/vllm-project/vllm/blob/6071e989df1531b59ef35568f83f7351afb0b51e/vllm/model_executor/models/phi4mm.py#L71
+        # We set it here to support cases where num_attention_heads is not divisible by the TP size.
+        from transformers import SiglipVisionConfig
+
+        vision_config = {
+            "hidden_size": 1152,
+            "image_size": 448,
+            "intermediate_size": 4304,
+            "model_type": "siglip_vision_model",
+            "num_attention_heads": 16,
+            "num_hidden_layers": 26,  # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
+            "patch_size": 14,
+        }
+        config.vision_config = SiglipVisionConfig(**vision_config)
     text_config = get_hf_text_config(config=config)

     if isinstance(model, str) and text_config is not None:

@@ -244,6 +263,11 @@ def get_tokenizer(
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_name.endswith(".json"):
+        from sglang.srt.tokenizer.tiktoken_tokenizer import TiktokenTokenizer
+
+        return TiktokenTokenizer(tokenizer_name)
+
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
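The get_tokenizer hunk adds a dispatch on the tokenizer path: any name ending in .json is now loaded through the new TiktokenTokenizer wrapper (sglang/srt/tokenizer/tiktoken_tokenizer.py, also added in this release) instead of Hugging Face. A sketch, assuming the wrapper exposes a conventional encode/decode interface; the file path is a placeholder:

```python
# Sketch only: the path is a placeholder, and the encode/decode calls assume a
# conventional tokenizer interface on TiktokenTokenizer.
from sglang.srt.hf_transformers_utils import get_tokenizer

tokenizer = get_tokenizer("/path/to/tokenizer.json")  # dispatches to TiktokenTokenizer
ids = tokenizer.encode("hello world")
print(ids)
print(tokenizer.decode(ids))
```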
sglang/srt/host_shared_memory.py (new file)

@@ -0,0 +1,83 @@
+import logging
+import os
+from dataclasses import dataclass
+from multiprocessing import shared_memory
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed.naive_distributed import get_naive_distributed
+from sglang.srt.utils import check_cuda_result
+
+logger = logging.getLogger(__name__)
+
+
+class HostSharedMemoryManager:
+    def __init__(self, base_name: str):
+        self._base_name = Path(base_name)
+        self._operation_index = 0
+        self._records: List[_Record] = []
+
+    def malloc(self, *, shape, dtype):
+        meta_tensor = torch.empty(size=shape, dtype=dtype, device="meta")
+        raw = self._malloc_raw(num_bytes=meta_tensor.nbytes)
+        return raw.view(dtype).view(*shape)
+
+    def _malloc_raw(self, *, num_bytes: int) -> torch.Tensor:
+        import cuda.bindings.runtime as cuda_rt
+
+        self._operation_index += 1
+        shm_name = f"{self._base_name}_op{self._operation_index}"
+
+        # TODO handle dispose
+        if get_naive_distributed().get_rank() == 0:
+            shm = shared_memory.SharedMemory(name=shm_name, create=True, size=num_bytes)
+
+        get_naive_distributed().barrier()
+
+        if get_naive_distributed().get_rank() != 0:
+            shm = shared_memory.SharedMemory(name=shm_name)
+
+        np_array = np.ndarray((num_bytes,), dtype=np.uint8, buffer=shm.buf)
+        tensor = torch.from_numpy(np_array)
+
+        check_cuda_result(
+            cuda_rt.cudaHostRegister(
+                tensor.data_ptr(), num_bytes, cuda_rt.cudaHostRegisterPortable
+            )
+        )
+
+        get_naive_distributed().barrier()
+
+        self._records.append(
+            _Record(
+                shm=shm,
+                np_array=np_array,
+                tensor=tensor,
+            )
+        )
+        return tensor
+
+
+@dataclass
+class _Record:
+    shm: shared_memory.SharedMemory
+    np_array: np.ndarray
+    tensor: torch.Tensor
+
+
+# Can have multi instances if needed
+_instance: Optional[HostSharedMemoryManager] = None
+
+
+def get_host_shared_memory_manager():
+    assert _instance is not None
+    return _instance
+
+
+def set_host_shared_memory_manager(instance: HostSharedMemoryManager):
+    global _instance
+    assert _instance is None
+    _instance = instance
sglang/srt/layers/attention/ascend_backend.py

@@ -1,7 +1,7 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional

 import torch
 import torch_npu

@@ -27,6 +27,7 @@ class ForwardMetadata:
     # seq len inputs
     extend_seq_lens_cpu_int: Optional[torch.Tensor] = None
     seq_lens_cpu_int: Optional[torch.Tensor] = None
+    seq_lens_cpu_list: Optional[List[int]] = None


 class AscendAttnBackend(AttentionBackend):

@@ -51,7 +52,7 @@ class AscendAttnBackend(AttentionBackend):

     def __init__(self, model_runner: ModelRunner):
         super().__init__()
-        self.forward_metadata =
+        self.forward_metadata = None
         self.device = model_runner.device
         self.gen_attention_mask(128, model_runner.dtype)
         self.page_size = model_runner.page_size

@@ -60,9 +61,15 @@ class AscendAttnBackend(AttentionBackend):
             self.kv_lora_rank = model_runner.model_config.kv_lora_rank
             self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
             self.native_attn = TorchNativeAttnBackend(model_runner)
+        self.graph_metadata = {}
+        self.max_context_len = model_runner.model_config.context_len
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.graph_mode = False

     def init_forward_metadata(self, forward_batch: ForwardBatch):
         """Init the metadata for a forward pass."""
+        self.forward_metadata = ForwardMetadata()
+
         self.forward_metadata.block_tables = (
             forward_batch.req_to_token_pool.req_to_token[
                 forward_batch.req_pool_indices, : forward_batch.seq_lens.max()

@@ -75,6 +82,63 @@ class AscendAttnBackend(AttentionBackend):
             )
         self.forward_metadata.seq_lens_cpu_int = forward_batch.seq_lens_cpu.int()

+        self.graph_mode = False
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.graph_metadata = {
+            "block_tables": torch.empty(
+                (max_bs, self.max_context_len // self.page_size),
+                dtype=torch.int32,
+                device=self.device,
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        metadata = ForwardMetadata()
+
+        metadata.block_tables = self.graph_metadata["block_tables"][:bs, :]
+        metadata.seq_lens_cpu_list = seq_lens.cpu().int().tolist()
+
+        self.graph_metadata[bs] = metadata
+        self.forward_metadata = metadata
+
+        self.graph_mode = True
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        metadata = self.graph_metadata[bs]
+        max_len = seq_lens_cpu[:bs].max().item()
+        max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+
+        metadata.block_tables[:bs, :max_seq_pages].copy_(
+            self.req_to_token[req_pool_indices[:bs], :max_len][:, :: self.page_size]
+            // self.page_size
+        )
+        metadata.block_tables[:bs, max_seq_pages:].fill_(0)
+        metadata.block_tables[bs:, :].fill_(0)
+
+        self.forward_metadata = metadata
+
+        self.graph_mode = True
+
     def get_cuda_graph_seq_len_fill_value(self):
         return 1

@@ -167,28 +231,74 @@ class AscendAttnBackend(AttentionBackend):
             layer, forward_batch.out_cache_loc, k, v
         )
         if not self.use_mla:
-
-
+            if self.graph_mode:
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
+                query = q.view(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                workspace = (
+                    torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                        query,
+                        k_cache,
+                        v_cache,
+                        block_table=self.forward_metadata.block_tables,
+                        block_size=self.page_size,
+                        num_heads=layer.tp_q_head_num,
+                        num_key_value_heads=layer.tp_k_head_num,
+                        input_layout="BSH",
+                        scale=layer.scaling,
+                        actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_list,
+                    )
+                )
+                output = torch.empty(
+                    (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
+                    dtype=q.dtype,
+                    device=q.device,
+                )
+                softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+                torch_npu.npu_fused_infer_attention_score.out(
+                    query,
+                    k_cache,
+                    v_cache,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSH",
+                    scale=layer.scaling,
+                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_list,
+                    workspace=workspace,
+                    out=[output, softmax_lse],
+                )
+            else:
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                )

-
-
-
-
-
-
-
+                query = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                output = torch.empty(
+                    (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
+                    dtype=query.dtype,
+                    device=query.device,
+                )

-
-
-
-
-
-
-
-
-
-
-
+                torch_npu._npu_paged_attention(
+                    query=query,
+                    key_cache=k_cache,
+                    value_cache=v_cache,
+                    num_heads=layer.tp_q_head_num,
+                    num_kv_heads=layer.tp_k_head_num,
+                    scale_value=layer.scaling,
+                    block_table=self.forward_metadata.block_tables,
+                    context_lens=self.forward_metadata.seq_lens_cpu_int,
+                    out=output,
+                )
             return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
         else:
             query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
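The replay path above rebuilds the block table from req_to_token by striding over the token slots with the page size and integer-dividing the slot ids. A standalone arithmetic sketch of that mapping (names and values are illustrative, not from the diff):

```python
# Sketch only: illustrative values showing how token slot ids are turned into
# page ids, mirroring `req_to_token[...][:, ::page_size] // page_size`.
import torch

page_size = 4
seq_len = 10
# One request's token slots in the KV pool (contiguous here for clarity).
req_to_token_row = torch.arange(100, 100 + seq_len)

max_seq_pages = (seq_len + page_size - 1) // page_size  # -> 3
block_table = req_to_token_row[::page_size] // page_size
print(max_seq_pages, block_table)  # 3 tensor([25, 26, 27])
```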