sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +26 -4
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +676 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +49 -8
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +42 -8
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +78 -13
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +133 -55
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +434 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +41 -19
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +60 -20
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +80 -53
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +25 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -19
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +78 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +87 -33
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +67 -30
- sglang/srt/lora/mem_pool.py +117 -52
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +18 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +43 -5
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -30
- sglang/srt/managers/scheduler.py +290 -31
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -24
- sglang/srt/managers/tp_worker.py +4 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +36 -21
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +75 -8
- sglang/srt/model_loader/loader.py +171 -3
- sglang/srt/model_loader/weight_utils.py +51 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +31 -88
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +329 -73
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +694 -0
- sglang/srt/models/gemma3_mm.py +468 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +201 -104
- sglang/srt/openai_api/protocol.py +33 -7
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +114 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +140 -54
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +215 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +29 -2
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +56 -5
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
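The per-file +/- counts above come from the registry's diff view. To regenerate a similar listing locally, a minimal sketch using only the Python standard library is shown below; the wheel file names are placeholders, and the counts will not match the registry's diff algorithm exactly:

# Sketch: list per-file line deltas between two wheels (a wheel is a zip archive).
# Assumes both .whl files were downloaded beforehand; the paths are placeholders.
import difflib
import zipfile

OLD = "sglang-0.4.4.post1-py3-none-any.whl"
NEW = "sglang-0.4.4.post3-py3-none-any.whl"

def read_texts(path):
    """Return {member name: decoded text} for the text-like files in a wheel."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace")
            for name in zf.namelist()
            if name.endswith((".py", ".txt", ".json"))
        }

old, new = read_texts(OLD), read_texts(NEW)
for name in sorted(set(old) | set(new)):
    diff = list(difflib.ndiff(old.get(name, "").splitlines(), new.get(name, "").splitlines()))
    added = sum(1 for line in diff if line.startswith("+ "))
    removed = sum(1 for line in diff if line.startswith("- "))
    if added or removed:
        print(f"{name} +{added} -{removed}")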
sglang/srt/openai_api/adapter.py
CHANGED
@@ -26,13 +26,10 @@ from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError
 
-
-
-
-
-    # outlines.integrations.utils
-    from outlines.integrations.utils import convert_json_schema_to_str
-
+from sglang.srt.code_completion_parser import (
+    generate_completion_prompt_from_request,
+    is_completion_template_defined,
+)
 from sglang.srt.conversation import (
     Conversation,
     SeparatorStyle,
@@ -41,7 +38,7 @@ from sglang.srt.conversation import (
     generate_embedding_convs,
     register_conv_template,
 )
-from sglang.srt.function_call_parser import
+from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
 from sglang.srt.openai_api.protocol import (
     BatchRequest,
@@ -75,7 +72,7 @@ from sglang.srt.openai_api.protocol import (
     UsageInfo,
 )
 from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.utils import get_exception_traceback
+from sglang.utils import convert_json_schema_to_str, get_exception_traceback
 
 logger = logging.getLogger(__name__)
 
@@ -310,6 +307,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         )
 
         try:
+            created = int(time.time())
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
                 ret = [ret]
@@ -317,13 +315,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 responses = v1_chat_generate_response(
                     request,
                     ret,
+                    created,
                     to_file=True,
                     cache_report=tokenizer_manager.server_args.enable_cache_report,
                     tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
                 )
             else:
                 responses = v1_generate_response(
-                    request,
+                    request,
+                    ret,
+                    tokenizer_manager,
+                    created,
+                    to_file=True,
+                    cache_report=tokenizer_manager.server_args.enable_cache_report,
                 )
 
         except Exception as e:
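Across this file the pattern is the same: a single `created = int(time.time())` is captured when the request is accepted and then threaded into every response builder, so the batch result, the non-streaming response, and each streamed chunk report one consistent `created` timestamp. A minimal sketch of the idea, with a plain dataclass standing in for sglang's response models:

import time
from dataclasses import dataclass

@dataclass
class Chunk:
    # Stand-in for CompletionStreamResponse / ChatCompletionStreamResponse.
    id: str
    created: int
    text: str

def stream_response(request_id, pieces):
    # Capture the timestamp once, when the request is accepted ...
    created = int(time.time())
    for piece in pieces:
        # ... and reuse it for every chunk instead of re-reading the clock,
        # so clients see a single creation time for the whole completion.
        yield Chunk(id=request_id, created=created, text=piece)

print([c.created for c in stream_response("req-1", ["Hello", ", ", "world"])])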
@@ -504,7 +508,11 @@ def v1_generate_request(
             "To compute logprobs of input prompt, please use the native /generate API."
         )
 
-
+        prompt = request.prompt
+        if is_completion_template_defined():
+            prompt = generate_completion_prompt_from_request(request)
+        prompts.append(prompt)
+
         lora_paths.append(request.lora_path)
         if request.echo and request.logprobs:
             current_logprob_start_len = 0
@@ -569,7 +577,9 @@ def v1_generate_request(
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
 
 
-def v1_generate_response(
+def v1_generate_response(
+    request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+):
     choices = []
     echo = False
 
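The new `code_completion_parser` hooks let `/v1/completions` rewrite the prompt through a server-configured completion template (for example a fill-in-the-middle template for code models) before it is tokenized. The real template schema lives in the new `sglang/srt/code_completion_parser.py`; the sketch below only illustrates the general shape, and the marker strings are invented:

# Illustrative only: the marker strings and template layout are assumptions,
# not the format defined in sglang/srt/code_completion_parser.py.
COMPLETION_TEMPLATE = {
    "fim_prefix": "<PRE>",
    "fim_suffix": "<SUF>",
    "fim_middle": "<MID>",
}

def generate_completion_prompt(prompt: str, suffix: str = "", template=None) -> str:
    """Wrap a plain completion prompt in a fill-in-the-middle template, if one is defined."""
    if not template:
        # Mirrors the diff: when no completion template is defined,
        # request.prompt is used as-is.
        return prompt
    return (
        f"{template['fim_prefix']}{prompt}"
        f"{template['fim_suffix']}{suffix}"
        f"{template['fim_middle']}"
    )

print(generate_completion_prompt("def add(a, b):\n", "\nprint(add(1, 2))", COMPLETION_TEMPLATE))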
@@ -635,7 +645,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason":
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -647,7 +657,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 index=idx,
                 text=text,
                 logprobs=logprobs,
-                finish_reason=
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -667,7 +677,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "text_completion",
-                "created":
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -686,14 +696,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
     )
     completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+    cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = CompletionResponse(
         id=ret[0]["meta_info"]["id"],
         model=request.model,
+        created=created,
         choices=choices,
         usage=UsageInfo(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=(
+                {"cached_tokens": cached_tokens} if cache_report else None
+            ),
         ),
     )
     return response
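The `cached_tokens` additions aggregate each result's `meta_info["cached_tokens"]` counter and, when the server's `enable_cache_report` option is on, expose the sum as `prompt_tokens_details` in the usage block, mirroring OpenAI's `prompt_tokens_details.cached_tokens` field. A rough sketch of the aggregation, with plain dicts standing in for the pydantic response models (the real code also de-duplicates prompt tokens across the `n` parallel choices, which is omitted here):

def build_usage(ret, cache_report=False):
    # `ret` is a list of per-choice results, each carrying a meta_info dict.
    prompt_tokens = sum(item["meta_info"]["prompt_tokens"] for item in ret)
    completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
    cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }
    if cache_report:
        # Only surfaced when cache reporting is enabled on the server.
        usage["prompt_tokens_details"] = {"cached_tokens": cached_tokens}
    return usage

ret = [
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 30, "cached_tokens": 8}},
    {"meta_info": {"prompt_tokens": 12, "completion_tokens": 25, "cached_tokens": 8}},
]
print(build_usage(ret, cache_report=True))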
@@ -702,6 +717,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     all_requests = [CompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)
 
     if adapted_request.stream:
@@ -711,6 +727,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
             n_prev_tokens = {}
             prompt_tokens = {}
             completion_tokens = {}
+            cached_tokens = {}
+
             try:
                 async for content in tokenizer_manager.generate_request(
                     adapted_request, raw_request
@@ -723,6 +741,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     text = content["text"]
                     prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                     completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                    cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
 
                     if not stream_buffer: # The first chunk
                         if request.echo:
@@ -786,7 +805,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                         index=index,
                         text=delta,
                         logprobs=logprobs,
-                        finish_reason=
+                        finish_reason=finish_reason["type"] if finish_reason else None,
                         matched_stop=(
                             finish_reason["matched"]
                             if finish_reason and "matched" in finish_reason
@@ -795,6 +814,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = CompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         object="text_completion",
                         choices=[choice_data],
                         model=request.model,
@@ -813,14 +833,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     total_completion_tokens = sum(
                         tokens for tokens in completion_tokens.values()
                     )
+                    cache_report = tokenizer_manager.server_args.enable_cache_report
+                    if cache_report:
+                        cached_tokens_sum = sum(
+                            tokens for tokens in cached_tokens.values()
+                        )
+                        prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                    else:
+                        prompt_tokens_details = None
                     usage = UsageInfo(
                         prompt_tokens=total_prompt_tokens,
                         completion_tokens=total_completion_tokens,
                         total_tokens=total_prompt_tokens + total_completion_tokens,
+                        prompt_tokens_details=prompt_tokens_details,
                     )
 
                     final_usage_chunk = CompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[],
                         model=request.model,
                         usage=usage,
@@ -851,7 +881,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if not isinstance(ret, list):
         ret = [ret]
 
-    response = v1_generate_response(
+    response = v1_generate_response(
+        request,
+        ret,
+        tokenizer_manager,
+        created,
+        cache_report=tokenizer_manager.server_args.enable_cache_report,
+    )
     return response
 
 
@@ -863,6 +899,7 @@ def v1_chat_generate_request(
     input_ids = []
     sampling_params_list = []
     image_data_list = []
+    audio_data_list = []
     return_logprobs = []
     logprob_start_lens = []
     top_logprobs_nums = []
@@ -876,7 +913,9 @@
         # - prompt: The full prompt string.
         # - stop: Custom stop tokens.
         # - image_data: None or a list of image strings (URLs or base64 strings).
+        # - audio_data: None or a list of audio strings (URLs).
         # None skips any image processing in GenerateReqInput.
+        strict_tag = None
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -891,6 +930,10 @@
                 else:
                     tools = [item.function.model_dump() for item in request.tools]
 
+                tool_call_parser = tokenizer_manager.server_args.tool_call_parser
+                parser = FunctionCallParser(request.tools, tool_call_parser)
+                strict_tag = parser.get_structure_tag()
+
             if chat_template_name is None:
                 openai_compatible_messages = []
                 for message in request.messages:
@@ -920,7 +963,7 @@
                     )
                 except:
                     # This except branch will be triggered when the chosen model
-                    # has a different tools input format that is not
+                    # has a different tools input format that is not compatible
                     # with openAI's apply_chat_template tool_call format, like Mistral.
                     tools = [t if "function" in t else {"function": t} for t in tools]
                     prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
@@ -940,11 +983,13 @@
                 prompt_ids += encoded
                 stop = request.stop
                 image_data = None
+                audio_data = None
                 modalities = []
             else:
                 conv = generate_chat_conv(request, chat_template_name)
                 prompt = conv.get_prompt()
                 image_data = conv.image_data
+                audio_data = conv.audio_data
                 modalities = conv.modalities
                 stop = conv.stop_str or []
                 if request.stop:
@@ -958,6 +1003,7 @@
             prompt_ids = request.messages
             stop = request.stop
             image_data = None
+            audio_data = None
             modalities = []
         input_ids.append(prompt_ids)
         return_logprobs.append(request.logprobs)
@@ -995,9 +1041,26 @@
             sampling_params["structural_tag"] = convert_json_schema_to_str(
                 request.response_format.model_dump(by_alias=True)
             )
+
+        if strict_tag is not None:
+            if (
+                sampling_params.get("regex")
+                or sampling_params.get("ebnf")
+                or sampling_params.get("structural_tag")
+                or sampling_params.get("json_schema")
+            ):
+                logger.warning(
+                    "Constrained decoding is not compatible with tool calls."
+                )
+            else:
+                sampling_params["structural_tag"] = convert_json_schema_to_str(
+                    strict_tag.model_dump(by_alias=True)
+                )
+
         sampling_params_list.append(sampling_params)
 
         image_data_list.append(image_data)
+        audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
         if isinstance(input_ids[0], str):
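The new `strict_tag` path asks `FunctionCallParser.get_structure_tag()` for a structural tag describing the tool-call syntax and injects it into `sampling_params` as a constrained-decoding spec, but only when the caller has not already supplied `regex`, `ebnf`, `json_schema`, or a `structural_tag`, since two grammars cannot be enforced at once. A condensed, standalone sketch of that precedence check (the dummy tag dict stands in for `strict_tag.model_dump(by_alias=True)`):

import json
import logging

logger = logging.getLogger(__name__)

CONSTRAINT_KEYS = ("regex", "ebnf", "structural_tag", "json_schema")

def attach_tool_call_tag(sampling_params: dict, strict_tag) -> dict:
    """Attach the tool-call structure tag unless another constraint already exists."""
    if strict_tag is None:
        return sampling_params
    if any(sampling_params.get(key) for key in CONSTRAINT_KEYS):
        # The user-supplied constraint wins; the tool-call grammar is skipped.
        logger.warning("Constrained decoding is not compatible with tool calls.")
        return sampling_params
    sampling_params["structural_tag"] = json.dumps(strict_tag)
    return sampling_params

# Dummy stand-in for parser.get_structure_tag().model_dump(by_alias=True).
tag = {"type": "structural_tag", "structures": [], "triggers": []}
print(attach_tool_call_tag({"temperature": 0.7}, tag))
print(attach_tool_call_tag({"regex": r"\d+"}, tag))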
@@ -1006,6 +1069,7 @@
             prompt_kwargs = {"input_ids": input_ids[0]}
         sampling_params_list = sampling_params_list[0]
         image_data_list = image_data_list[0]
+        audio_data_list = audio_data_list[0]
         return_logprobs = return_logprobs[0]
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
@@ -1020,6 +1084,7 @@
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         image_data=image_data_list,
+        audio_data=audio_data_list,
         sampling_params=sampling_params_list,
         return_logprob=return_logprobs,
         logprob_start_len=logprob_start_lens,
@@ -1037,6 +1102,7 @@
 def v1_chat_generate_response(
     request,
     ret,
+    created,
     to_file=False,
     cache_report=False,
     tool_call_parser=None,
@@ -1053,7 +1119,9 @@ def v1_chat_generate_response(
         if logprobs:
             logprobs = to_openai_style_logprobs(
                 output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
-                output_top_logprobs=ret_item["meta_info"]
+                output_top_logprobs=ret_item["meta_info"].get(
+                    "output_top_logprobs", None
+                ),
             )
             token_logprobs = []
             for token_idx, (token, logprob) in enumerate(
@@ -1122,7 +1190,7 @@
                     finish_reason["type"] = "tool_calls"
                     finish_reason["matched"] = None
                 try:
-
+                    text, call_info_list = parser.parse_non_stream(text)
                     tool_calls = [
                         ToolCall(
                             id=str(call_info.tool_index),
@@ -1145,12 +1213,12 @@
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": text if
+                    "content": text if text else None,
                     "tool_calls": tool_calls,
-                    "reasoning_content": reasoning_text,
+                    "reasoning_content": reasoning_text if reasoning_text else None,
                 },
                 "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
-                "finish_reason":
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1162,12 +1230,12 @@
                 index=idx,
                 message=ChatMessage(
                     role="assistant",
-                    content=text if
+                    content=text if text else None,
                     tool_calls=tool_calls,
-                    reasoning_content=reasoning_text,
+                    reasoning_content=reasoning_text if reasoning_text else None,
                 ),
                 logprobs=choice_logprobs,
-                finish_reason=
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1188,7 +1256,7 @@
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
-                "created":
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -1210,6 +1278,7 @@
     cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = ChatCompletionResponse(
         id=ret[0]["meta_info"]["id"],
+        created=created,
         model=request.model,
         choices=choices,
         usage=UsageInfo(
@@ -1224,9 +1293,12 @@
     return response
 
 
-async def v1_chat_completions(
+async def v1_chat_completions(
+    tokenizer_manager, raw_request: Request, cache_report=False
+):
     request_json = await raw_request.json()
     all_requests = [ChatCompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
 
     if adapted_request.stream:
@@ -1239,6 +1311,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
             n_prev_tokens = {}
             prompt_tokens = {}
            completion_tokens = {}
+            cached_tokens = {}
             try:
                 async for content in tokenizer_manager.generate_request(
                     adapted_request, raw_request
@@ -1252,14 +1325,15 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
                     prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                     completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                    cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
                     if request.logprobs:
                         logprobs = to_openai_style_logprobs(
                             output_token_logprobs=content["meta_info"][
                                 "output_token_logprobs"
                             ][n_prev_token:],
-                            output_top_logprobs=content["meta_info"]
-                            "output_top_logprobs"
-
+                            output_top_logprobs=content["meta_info"].get(
+                                "output_top_logprobs", []
+                            )[n_prev_token:],
                         )
 
                         n_prev_token = len(
@@ -1305,21 +1379,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     if is_first:
                         # First chunk with role
                         is_first = False
-
-                            tokenizer_manager.server_args.reasoning_parser
-                            and request.separate_reasoning
-                        ):
-                            delta = DeltaMessage(role="assistant", reasoning_content="")
-                        else:
-                            delta = DeltaMessage(role="assistant", content="")
+                        delta = DeltaMessage(role="assistant")
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=delta,
-                            finish_reason=
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                             matched_stop=(
                                 finish_reason["matched"]
                                 if finish_reason and "matched" in finish_reason
@@ -1329,6 +1393,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         )
                         chunk = ChatCompletionStreamResponse(
                             id=content["meta_info"]["id"],
+                            created=created,
                             choices=[choice_data],
                             model=request.model,
                         )
@@ -1354,16 +1419,16 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         if reasoning_text:
                             choice_data = ChatCompletionResponseStreamChoice(
                                 index=index,
-                                delta=DeltaMessage(
-
-
-
-                                    and len(finish_reason_type) == 0
-                                    else finish_reason_type
+                                delta=DeltaMessage(
+                                    reasoning_content=(
+                                        reasoning_text if reasoning_text else None
+                                    )
                                 ),
+                                finish_reason=finish_reason_type,
                             )
                             chunk = ChatCompletionStreamResponse(
                                 id=content["meta_info"]["id"],
+                                created=created,
                                 choices=[choice_data],
                                 model=request.model,
                             )
@@ -1388,16 +1453,14 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         if normal_text:
                             choice_data = ChatCompletionResponseStreamChoice(
                                 index=index,
-                                delta=DeltaMessage(
-
-                                    None
-                                    if finish_reason_type
-                                    and len(finish_reason_type) == 0
-                                    else finish_reason_type
+                                delta=DeltaMessage(
+                                    content=normal_text if normal_text else None
                                 ),
+                                finish_reason=finish_reason_type,
                             )
                             chunk = ChatCompletionStreamResponse(
                                 id=content["meta_info"]["id"],
+                                created=created,
                                 choices=[choice_data],
                                 model=request.model,
                             )
@@ -1407,11 +1470,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         for call_item in calls:
                             # transform call_item -> FunctionResponse + ToolCall
 
-                            if
-                                content["meta_info"]["finish_reason"]
-                                and content["meta_info"]["finish_reason"]["type"]
-                                == "stop"
-                            ):
+                            if finish_reason_type == "stop":
                                 latest_delta_len = 0
                                 if isinstance(call_item.parameters, str):
                                     latest_delta_len = len(call_item.parameters)
@@ -1432,6 +1491,8 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                                 )
                                 call_item.parameters = remaining_call
 
+                                finish_reason_type = "tool_calls"
+
                             tool_call = ToolCall(
                                 id=str(call_item.tool_index),
                                 function=FunctionResponse(
@@ -1441,13 +1502,17 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                             )
                             choice_data = ChatCompletionResponseStreamChoice(
                                 index=index,
-                                delta=DeltaMessage(
-
-
-
+                                delta=DeltaMessage(tool_calls=[tool_call]),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ), # additional chunk will be return
                             )
                             chunk = ChatCompletionStreamResponse(
                                 id=content["meta_info"]["id"],
+                                created=created,
                                 choices=[choice_data],
                                 model=request.model,
                             )
@@ -1458,29 +1523,44 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
                     else:
                         # No tool calls => just treat this as normal text
-
-
-
-
-
-
-                            else
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        if delta or not (
+                            request.stream_options
+                            and request.stream_options.include_usage
+                        ):
+                            choice_data = ChatCompletionResponseStreamChoice(
+                                index=index,
+                                delta=DeltaMessage(content=delta if delta else None),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ),
+                                matched_stop=(
+                                    finish_reason["matched"]
+                                    if finish_reason and "matched" in finish_reason
+                                    else None
+                                ),
+                                logprobs=choice_logprobs,
+                            )
+                            chunk = ChatCompletionStreamResponse(
+                                id=content["meta_info"]["id"],
+                                created=created,
+                                choices=[choice_data],
+                                model=request.model,
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
+                    stream_buffers[index] = new_stream_buffer
+                    is_firsts[index] = is_first
+                if finish_reason_type == "stop" and request.tool_choice != "none":
+                    parser = FunctionCallParser(
+                        tools=request.tools,
+                        tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+                    )
+                    if parser.has_tool_call(new_stream_buffer):
+                        # if the stream ends with empty string after tool calls
+                        finish_reason_type = "tool_calls"
+
                 if request.stream_options and request.stream_options.include_usage:
                     total_prompt_tokens = sum(
                         tokens
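On the streaming side, the handler now reports `finish_reason: "tool_calls"` once the buffered output is recognized as a tool call, and when `stream_options.include_usage` is set it defers the final finish reason to a trailing chunk that also carries the usage block. A client-side sketch of consuming such a stream with the `openai` Python package; the server URL, model name, and tool definition are placeholders:

# Client-side sketch: watch for tool-call deltas and the trailing usage chunk.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        },
    }],
    stream=True,
    stream_options={"include_usage": True},  # requests the trailing usage chunk
)

for chunk in stream:
    if chunk.usage is not None:  # trailing chunk carrying the usage block
        print("prompt_tokens_details:", chunk.usage.prompt_tokens_details)
    for choice in chunk.choices:
        if choice.delta.tool_calls:
            print("tool call delta:", choice.delta.tool_calls[0].function.arguments)
        if choice.finish_reason == "tool_calls":
            print("stream finished with a tool call")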
@@ -1490,22 +1570,37 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     total_completion_tokens = sum(
                         tokens for tokens in completion_tokens.values()
                     )
+                    cache_report = tokenizer_manager.server_args.enable_cache_report
+                    if cache_report:
+                        cached_tokens_sum = sum(
+                            tokens for tokens in cached_tokens.values()
+                        )
+                        prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                    else:
+                        prompt_tokens_details = None
                     usage = UsageInfo(
                         prompt_tokens=total_prompt_tokens,
                         completion_tokens=total_completion_tokens,
                         total_tokens=total_prompt_tokens + total_completion_tokens,
+                        prompt_tokens_details=prompt_tokens_details,
                     )
 
-
-
-
-
-
-
-
-
-
-
+                else:
+                    usage = None
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(),
+                            finish_reason=finish_reason_type,
+                        )
+                    ],
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {final_usage_chunk.model_dump_json()}\n\n"
             except ValueError as e:
                 error = create_streaming_error_response(str(e))
                 yield f"data: {error}\n\n"
@@ -1530,6 +1625,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     response = v1_chat_generate_response(
         request,
         ret,
+        created,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
         reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
@@ -1557,18 +1653,19 @@ def v1_embedding_request(all_requests, tokenizer_manager):
         elif isinstance(prompt, list) and isinstance(
             prompt[0], MultimodalEmbeddingInput
         ):
-            assert (
-                chat_template_name is not None
-            ), "chat_template_name is required for multimodal inputs"
             texts = []
             images = []
             for item in prompt:
-
+                # TODO simply use padding for text, we should use a better way to handle this
+                texts.append(item.text if item.text is not None else "padding")
                 images.append(item.image if item.image is not None else None)
-            convs = generate_embedding_convs(texts, images, chat_template_name)
             generate_prompts = []
-
-
+            if chat_template_name is not None:
+                convs = generate_embedding_convs(texts, images, chat_template_name)
+                for conv in convs:
+                    generate_prompts.append(conv.get_prompt())
+            else:
+                generate_prompts = texts
             if len(generate_prompts) == 1:
                 prompt_kwargs = {"text": generate_prompts[0], "image_data": images[0]}
             else: