sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +164 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +62 -23
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +26 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +4 -5
- sglang/srt/managers/data_parallel_controller.py +31 -9
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -31
- sglang/srt/managers/scheduler.py +325 -38
- sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +27 -8
- sglang/srt/mem_cache/memory_pool.py +258 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +85 -28
- sglang/srt/model_executor/forward_batch_info.py +81 -15
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +326 -192
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +145 -47
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +104 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -20,19 +20,16 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Dict, List
+from typing import Any, Dict, List, Set
 
 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError
 
-try:
-    from outlines.fsm.json_schema import convert_json_schema_to_str
-except ImportError:
-    # Before outlines 0.0.47, convert_json_schema_to_str is under
-    # outlines.integrations.utils
-    from outlines.integrations.utils import convert_json_schema_to_str
-
+from sglang.srt.code_completion_parser import (
+    generate_completion_prompt_from_request,
+    is_completion_template_defined,
+)
 from sglang.srt.conversation import (
     Conversation,
     SeparatorStyle,
@@ -41,7 +38,7 @@ from sglang.srt.conversation import (
     generate_embedding_convs,
     register_conv_template,
 )
-from sglang.srt.function_call_parser import
+from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
 from sglang.srt.openai_api.protocol import (
     BatchRequest,
@@ -75,7 +72,7 @@ from sglang.srt.openai_api.protocol import (
     UsageInfo,
 )
 from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.utils import get_exception_traceback
+from sglang.utils import convert_json_schema_to_str, get_exception_traceback
 
 logger = logging.getLogger(__name__)
 
@@ -310,6 +307,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             )
 
         try:
+            created = int(time.time())
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
                 ret = [ret]
@@ -317,13 +315,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 responses = v1_chat_generate_response(
                     request,
                     ret,
+                    created,
                     to_file=True,
                     cache_report=tokenizer_manager.server_args.enable_cache_report,
                     tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
                 )
             else:
                 responses = v1_generate_response(
-                    request,
+                    request,
+                    ret,
+                    tokenizer_manager,
+                    created,
+                    to_file=True,
+                    cache_report=tokenizer_manager.server_args.enable_cache_report,
                 )
 
         except Exception as e:
@@ -504,7 +508,11 @@ def v1_generate_request(
                 "To compute logprobs of input prompt, please use the native /generate API."
             )
 
-        prompts.append(request.prompt)
+        prompt = request.prompt
+        if is_completion_template_defined():
+            prompt = generate_completion_prompt_from_request(request)
+        prompts.append(prompt)
+
         lora_paths.append(request.lora_path)
         if request.echo and request.logprobs:
             current_logprob_start_len = 0
@@ -569,7 +577,9 @@ def v1_generate_request(
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
 
 
-def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+def v1_generate_response(
+    request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+):
     choices = []
     echo = False
 
@@ -667,7 +677,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "text_completion",
-                "created": int(time.time()),
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -686,14 +696,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
     )
     completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+    cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = CompletionResponse(
         id=ret[0]["meta_info"]["id"],
         model=request.model,
+        created=created,
        choices=choices,
         usage=UsageInfo(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=(
+                {"cached_tokens": cached_tokens} if cache_report else None
+            ),
         ),
     )
     return response
@@ -702,6 +717,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     all_requests = [CompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)
 
     if adapted_request.stream:
@@ -711,6 +727,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
+
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -723,6 +741,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 text = content["text"]
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
 
                 if not stream_buffer:  # The first chunk
                     if request.echo:
@@ -795,6 +814,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     object="text_completion",
                     choices=[choice_data],
                     model=request.model,
@@ -813,14 +833,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
             total_completion_tokens = sum(
                 tokens for tokens in completion_tokens.values()
             )
+            cache_report = tokenizer_manager.server_args.enable_cache_report
+            if cache_report:
+                cached_tokens_sum = sum(
+                    tokens for tokens in cached_tokens.values()
+                )
+                prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+            else:
+                prompt_tokens_details = None
             usage = UsageInfo(
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 total_tokens=total_prompt_tokens + total_completion_tokens,
+                prompt_tokens_details=prompt_tokens_details,
             )
 
             final_usage_chunk = CompletionStreamResponse(
                 id=content["meta_info"]["id"],
+                created=created,
                 choices=[],
                 model=request.model,
                 usage=usage,
@@ -851,7 +881,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if not isinstance(ret, list):
         ret = [ret]
 
-    response = v1_generate_response(
+    response = v1_generate_response(
+        request,
+        ret,
+        tokenizer_manager,
+        created,
+        cache_report=tokenizer_manager.server_args.enable_cache_report,
+    )
     return response
 
 
@@ -863,6 +899,7 @@ def v1_chat_generate_request(
     input_ids = []
     sampling_params_list = []
     image_data_list = []
+    audio_data_list = []
     return_logprobs = []
     logprob_start_lens = []
     top_logprobs_nums = []
@@ -876,7 +913,9 @@ def v1_chat_generate_request(
         # - prompt: The full prompt string.
         # - stop: Custom stop tokens.
         # - image_data: None or a list of image strings (URLs or base64 strings).
+        # - audio_data: None or a list of audio strings (URLs).
         # None skips any image processing in GenerateReqInput.
+        strict_tag = None
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -891,6 +930,10 @@ def v1_chat_generate_request(
                 else:
                     tools = [item.function.model_dump() for item in request.tools]
 
+                tool_call_parser = tokenizer_manager.server_args.tool_call_parser
+                parser = FunctionCallParser(request.tools, tool_call_parser)
+                strict_tag = parser.get_structure_tag()
+
             if chat_template_name is None:
                 openai_compatible_messages = []
                 for message in request.messages:
@@ -920,7 +963,7 @@ def v1_chat_generate_request(
                     )
                 except:
                     # This except branch will be triggered when the chosen model
-                    # has a different tools input format that is not
+                    # has a different tools input format that is not compatible
                    # with openAI's apply_chat_template tool_call format, like Mistral.
                     tools = [t if "function" in t else {"function": t} for t in tools]
                     prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
@@ -940,11 +983,13 @@ def v1_chat_generate_request(
                     prompt_ids += encoded
                 stop = request.stop
                 image_data = None
+                audio_data = None
                 modalities = []
             else:
                 conv = generate_chat_conv(request, chat_template_name)
                 prompt = conv.get_prompt()
                 image_data = conv.image_data
+                audio_data = conv.audio_data
                 modalities = conv.modalities
                 stop = conv.stop_str or []
                 if request.stop:
@@ -958,6 +1003,7 @@ def v1_chat_generate_request(
             prompt_ids = request.messages
             stop = request.stop
             image_data = None
+            audio_data = None
             modalities = []
         input_ids.append(prompt_ids)
         return_logprobs.append(request.logprobs)
@@ -995,9 +1041,26 @@ def v1_chat_generate_request(
             sampling_params["structural_tag"] = convert_json_schema_to_str(
                 request.response_format.model_dump(by_alias=True)
             )
+
+        if strict_tag is not None:
+            if (
+                sampling_params.get("regex")
+                or sampling_params.get("ebnf")
+                or sampling_params.get("structural_tag")
+                or sampling_params.get("json_schema")
+            ):
+                logger.warning(
+                    "Constrained decoding is not compatible with tool calls."
+                )
+            else:
+                sampling_params["structural_tag"] = convert_json_schema_to_str(
+                    strict_tag.model_dump(by_alias=True)
+                )
+
         sampling_params_list.append(sampling_params)
 
         image_data_list.append(image_data)
+        audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
         if isinstance(input_ids[0], str):
@@ -1006,6 +1069,7 @@ def v1_chat_generate_request(
             prompt_kwargs = {"input_ids": input_ids[0]}
         sampling_params_list = sampling_params_list[0]
         image_data_list = image_data_list[0]
+        audio_data_list = audio_data_list[0]
         return_logprobs = return_logprobs[0]
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
@@ -1020,6 +1084,7 @@ def v1_chat_generate_request(
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         image_data=image_data_list,
+        audio_data=audio_data_list,
         sampling_params=sampling_params_list,
         return_logprob=return_logprobs,
         logprob_start_len=logprob_start_lens,
@@ -1037,6 +1102,7 @@ def v1_chat_generate_request(
 def v1_chat_generate_response(
     request,
     ret,
+    created,
     to_file=False,
     cache_report=False,
     tool_call_parser=None,
@@ -1115,27 +1181,29 @@ def v1_chat_generate_response(
         else:
             reasoning_text = None
 
-        if tool_choice != "none" and
-
-
-
-
-
-
-
-
-
-
-
+        if tool_choice != "none" and tools:
+            parser = FunctionCallParser(tools, tool_call_parser)
+            if parser.has_tool_call(text):
+                if finish_reason["type"] == "stop":
+                    finish_reason["type"] = "tool_calls"
+                    finish_reason["matched"] = None
+                try:
+                    text, call_info_list = parser.parse_non_stream(text)
+                    tool_calls = [
+                        ToolCall(
+                            id=str(call_info.tool_index),
+                            function=FunctionResponse(
+                                name=call_info.name, arguments=call_info.parameters
+                            ),
+                        )
+                        for call_info in call_info_list
+                    ]
+                except Exception as e:
+                    logger.error(f"Exception: {e}")
+                    return create_error_response(
+                        HTTPStatus.BAD_REQUEST,
+                        "Failed to parse fc related info to json format!",
                     )
-                        for call_info in call_info_list
-                    ]
-                except Exception as e:
-                    logger.error(f"Exception: {e}")
-                    return create_error_response(
-                        HTTPStatus.BAD_REQUEST,
-                        "Failed to parse fc related info to json format!",
-                    )
 
         if to_file:
             # to make the choice data json serializable
@@ -1143,9 +1211,9 @@ def v1_chat_generate_response(
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": text if
+                    "content": text if text else None,
                     "tool_calls": tool_calls,
-                    "reasoning_content": reasoning_text,
+                    "reasoning_content": reasoning_text if reasoning_text else None,
                 },
                 "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
                 "finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1160,9 +1228,9 @@ def v1_chat_generate_response(
                 index=idx,
                 message=ChatMessage(
                     role="assistant",
-                    content=text if
+                    content=text if text else None,
                     tool_calls=tool_calls,
-                    reasoning_content=reasoning_text,
+                    reasoning_content=reasoning_text if reasoning_text else None,
                 ),
                 logprobs=choice_logprobs,
                 finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1186,7 +1254,7 @@ def v1_chat_generate_response(
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
-                "created": int(time.time()),
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -1208,6 +1276,7 @@ def v1_chat_generate_response(
     cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = ChatCompletionResponse(
         id=ret[0]["meta_info"]["id"],
+        created=created,
         model=request.model,
         choices=choices,
         usage=UsageInfo(
@@ -1222,9 +1291,12 @@ def v1_chat_generate_response(
     return response
 
 
-async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+async def v1_chat_completions(
+    tokenizer_manager, raw_request: Request, cache_report=False
+):
     request_json = await raw_request.json()
     all_requests = [ChatCompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
 
     if adapted_request.stream:
@@ -1237,6 +1309,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -1250,6 +1323,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
                 if request.logprobs:
                     logprobs = to_openai_style_logprobs(
                         output_token_logprobs=content["meta_info"][
@@ -1307,9 +1381,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         tokenizer_manager.server_args.reasoning_parser
                         and request.separate_reasoning
                     ):
-                        delta = DeltaMessage(
+                        delta = DeltaMessage(
+                            role="assistant", reasoning_content=None
+                        )
                     else:
-                        delta = DeltaMessage(role="assistant", content=
+                        delta = DeltaMessage(role="assistant", content=None)
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=delta,
@@ -1327,6 +1403,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1352,7 +1429,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 if reasoning_text:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(
+                        delta=DeltaMessage(
+                            reasoning_content=(
+                                reasoning_text if reasoning_text else None
+                            )
+                        ),
                         finish_reason=(
                             None
                             if finish_reason_type
@@ -1362,6 +1443,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1386,7 +1468,9 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 if normal_text:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(
+                        delta=DeltaMessage(
+                            content=normal_text if normal_text else None
+                        ),
                         finish_reason=(
                             None
                             if finish_reason_type
@@ -1396,6 +1480,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1446,6 +1531,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1458,7 +1544,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     # No tool calls => just treat this as normal text
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta),
+                        delta=DeltaMessage(content=delta if delta else None),
                         finish_reason=(
                             None
                             if finish_reason_type and len(finish_reason_type) == 0
@@ -1473,6 +1559,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1488,14 +1575,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
             total_completion_tokens = sum(
                 tokens for tokens in completion_tokens.values()
             )
+            cache_report = tokenizer_manager.server_args.enable_cache_report
+            if cache_report:
+                cached_tokens_sum = sum(
+                    tokens for tokens in cached_tokens.values()
+                )
+                prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+            else:
+                prompt_tokens_details = None
             usage = UsageInfo(
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 total_tokens=total_prompt_tokens + total_completion_tokens,
+                prompt_tokens_details=prompt_tokens_details,
             )
 
             final_usage_chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
+                created=created,
                 choices=[],
                 model=request.model,
                 usage=usage,
@@ -1528,6 +1625,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     response = v1_chat_generate_response(
         request,
         ret,
+        created,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
         reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
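Taken together, the adapter changes surface to OpenAI-compatible clients in two small ways: every response and stream chunk now carries a single `created` timestamp computed once per request, and when cache reporting is enabled on the server the `usage` block gains `prompt_tokens_details.cached_tokens`. A minimal client-side sketch (not part of the diff), assuming an sglang server is already listening on localhost:30000 with the `enable_cache_report` server argument turned on; the endpoint, port, and model name are placeholders:

```python
import requests

# Hypothetical local deployment; adjust host, port, and model to your setup.
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
    timeout=60,
).json()

print(resp["created"])  # set once per request and reused across chunks
usage = resp["usage"]
# Populated only when cache reporting is enabled on the server.
print(usage.get("prompt_tokens_details"))  # e.g. {"cached_tokens": 0}
```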
sglang/srt/openai_api/protocol.py
CHANGED
@@ -16,7 +16,7 @@
 import time
 from typing import Dict, List, Optional, Union
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, root_validator
 from typing_extensions import Literal
 
 
@@ -227,14 +227,25 @@ class ChatCompletionMessageContentImageURL(BaseModel):
     detail: Optional[Literal["auto", "low", "high"]] = "auto"
 
 
+class ChatCompletionMessageContentAudioURL(BaseModel):
+    url: str
+
+
 class ChatCompletionMessageContentImagePart(BaseModel):
     type: Literal["image_url"]
     image_url: ChatCompletionMessageContentImageURL
     modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
 
 
+class ChatCompletionMessageContentAudioPart(BaseModel):
+    type: Literal["audio_url"]
+    audio_url: ChatCompletionMessageContentAudioURL
+
+
 ChatCompletionMessageContentPart = Union[
-    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentAudioPart,
 ]
 
 
@@ -276,6 +287,7 @@ class Function(BaseModel):
     description: Optional[str] = Field(default=None, examples=[None])
     name: Optional[str] = None
     parameters: Optional[object] = None
+    strict: bool = False
 
 
 class Tool(BaseModel):
@@ -323,6 +335,15 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     )  # noqa
 
+    @root_validator(pre=True)
+    def set_tool_choice_default(cls, values):
+        if values.get("tool_choice") is None:
+            if values.get("tools") is None:
+                values["tool_choice"] = "none"
+            else:
+                values["tool_choice"] = "auto"
+        return values
+
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0
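The protocol additions above introduce `audio_url` content parts and a `root_validator` that resolves a missing `tool_choice` to `"none"` (no tools) or `"auto"` (tools present). A minimal sketch of both, assuming sglang 0.4.4.post2 is installed and that the surrounding message models coerce plain dicts as pydantic usually does; the model and tool names are placeholders:

```python
from sglang.srt.openai_api.protocol import ChatCompletionRequest

payload = {
    "model": "default",  # placeholder model name
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is said in this clip?"},
                # New ChatCompletionMessageContentAudioPart
                {"type": "audio_url", "audio_url": {"url": "https://example.com/clip.wav"}},
            ],
        }
    ],
}

req = ChatCompletionRequest(**payload)
assert req.tool_choice == "none"  # no tools supplied -> defaulted to "none"

req_with_tools = ChatCompletionRequest(
    **payload,
    tools=[{"type": "function", "function": {"name": "lookup", "parameters": {}}}],
)
assert req_with_tools.tool_choice == "auto"  # tools supplied -> defaulted to "auto"
```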
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -306,7 +306,7 @@ class SamplingBatchInfo:
         ]:
             self_val = getattr(self, item, None)
             other_val = getattr(other, item, None)
-            setattr(self, item, torch.
+            setattr(self, item, torch.cat([self_val, other_val]))
 
         self.is_all_greedy |= other.is_all_greedy
         self.need_min_p_sampling |= other.need_min_p_sampling
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -77,7 +77,7 @@ class SamplingParams:
         self.custom_params = custom_params
 
         # Process some special cases
-        if self.temperature < _SAMPLING_EPS:
+        if 0 <= self.temperature < _SAMPLING_EPS:
             # top_k = 1 means greedy sampling
             self.temperature = 1.0
             self.top_k = 1
@@ -93,9 +93,9 @@ class SamplingParams:
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
         if not 0.0 <= self.min_p <= 1.0:
             raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
-        if self.top_k <
+        if self.top_k < 1 or self.top_k == -1:
             raise ValueError(
-                f"top_k must be -1 (disable)
+                f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
             )
         if not -2.0 <= self.frequency_penalty <= 2.0:
             raise ValueError(
@@ -108,12 +108,12 @@ class SamplingParams:
             )
         if not 0.0 <= self.repetition_penalty <= 2.0:
             raise ValueError(
-                "repetition_penalty must be in
+                "repetition_penalty must be in [0, 2], got "
                 f"{self.repetition_penalty}."
             )
         if not 0 <= self.min_new_tokens:
             raise ValueError(
-                f"min_new_tokens must be in
+                f"min_new_tokens must be in [0, max_new_tokens], got "
                 f"{self.min_new_tokens}."
             )
         if self.max_new_tokens is not None:
@@ -123,7 +123,7 @@ class SamplingParams:
             )
             if not self.min_new_tokens <= self.max_new_tokens:
                 raise ValueError(
-                    f"min_new_tokens must be in
+                    f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
                     f"{self.min_new_tokens}."
                 )
         grammars = [
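The first `SamplingParams` hunk above narrows the greedy shortcut to non-negative temperatures; the other hunks adjust the top_k check and the wording of several validation errors. A standalone sketch of the adjusted special case (not the sglang class itself; `_SAMPLING_EPS` is assumed to be a small epsilon like the constant defined in sampling_params.py):

```python
_SAMPLING_EPS = 1e-6  # assumed value for illustration


def apply_greedy_shortcut(temperature: float, top_k: int) -> tuple[float, int]:
    # Only non-negative, near-zero temperatures are rewritten to greedy decoding
    # (temperature 1.0 with top_k = 1); anything else is left untouched.
    if 0 <= temperature < _SAMPLING_EPS:
        return 1.0, 1
    return temperature, top_k


assert apply_greedy_shortcut(0.0, 50) == (1.0, 1)     # greedy decoding
assert apply_greedy_shortcut(0.7, 50) == (0.7, 50)    # unchanged
assert apply_greedy_shortcut(-1.0, 50) == (-1.0, 50)  # no longer coerced to greedy
```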
|