sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +133 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +32 -21
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +133 -30
- sglang/srt/managers/scheduler.py +273 -20
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +27 -13
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +208 -77
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +124 -28
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +99 -9
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +167 -123
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -20,19 +20,16 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Dict, List
+from typing import Any, Dict, List, Set

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError

-
-
-
-
-# outlines.integrations.utils
-from outlines.integrations.utils import convert_json_schema_to_str
-
+from sglang.srt.code_completion_parser import (
+    generate_completion_prompt_from_request,
+    is_completion_template_defined,
+)
 from sglang.srt.conversation import (
     Conversation,
     SeparatorStyle,
@@ -41,7 +38,7 @@ from sglang.srt.conversation import (
     generate_embedding_convs,
     register_conv_template,
 )
-from sglang.srt.function_call_parser import
+from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
 from sglang.srt.openai_api.protocol import (
     BatchRequest,
@@ -75,7 +72,7 @@ from sglang.srt.openai_api.protocol import (
     UsageInfo,
 )
 from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.utils import get_exception_traceback
+from sglang.utils import convert_json_schema_to_str, get_exception_traceback

 logger = logging.getLogger(__name__)

@@ -310,6 +307,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRequest):
             )

             try:
+                created = int(time.time())
                 ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
                 if not isinstance(ret, list):
                     ret = [ret]
@@ -317,13 +315,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRequest):
                     responses = v1_chat_generate_response(
                         request,
                         ret,
+                        created,
                         to_file=True,
                         cache_report=tokenizer_manager.server_args.enable_cache_report,
                         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
                     )
                 else:
                     responses = v1_generate_response(
-                        request,
+                        request,
+                        ret,
+                        tokenizer_manager,
+                        created,
+                        to_file=True,
+                        cache_report=tokenizer_manager.server_args.enable_cache_report,
                     )

             except Exception as e:
@@ -504,7 +508,11 @@ def v1_generate_request(
                 "To compute logprobs of input prompt, please use the native /generate API."
             )

-
+        prompt = request.prompt
+        if is_completion_template_defined():
+            prompt = generate_completion_prompt_from_request(request)
+        prompts.append(prompt)
+
         lora_paths.append(request.lora_path)
         if request.echo and request.logprobs:
             current_logprob_start_len = 0
@@ -569,7 +577,9 @@ def v1_generate_request(
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


-def v1_generate_response(
+def v1_generate_response(
+    request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+):
     choices = []
     echo = False

@@ -667,7 +677,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "text_completion",
-                "created":
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -686,14 +696,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
         ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
     )
     completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+    cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = CompletionResponse(
         id=ret[0]["meta_info"]["id"],
         model=request.model,
+        created=created,
         choices=choices,
         usage=UsageInfo(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=(
+                {"cached_tokens": cached_tokens} if cache_report else None
+            ),
         ),
     )
     return response
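The hunks above thread a single `created = int(time.time())` timestamp through `v1_generate_response` and, when the server runs with `--enable-cache-report`, attach a `prompt_tokens_details` block carrying the summed `cached_tokens` to the usage info. A minimal client-side sketch of reading the new fields from a non-streaming /v1/completions response; the host, port, and model name are placeholders, not taken from the diff:

```python
import requests

# Hypothetical local deployment; adjust URL and model to your setup.
resp = requests.post(
    "http://127.0.0.1:30000/v1/completions",
    json={"model": "default", "prompt": "Hello", "max_tokens": 8},
).json()

print(resp["created"])  # unix timestamp, set once per request
usage = resp["usage"]
# prompt_tokens_details is only populated when the server was started with
# --enable-cache-report; otherwise it is null/absent.
details = usage.get("prompt_tokens_details") or {}
print(details.get("cached_tokens", 0))
```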
@@ -702,6 +717,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     all_requests = [CompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)

     if adapted_request.stream:
@@ -711,6 +727,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
+
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -723,6 +741,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 text = content["text"]
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)

                 if not stream_buffer:  # The first chunk
                     if request.echo:
@@ -795,6 +814,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 )
                 chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     object="text_completion",
                     choices=[choice_data],
                     model=request.model,
@@ -813,14 +833,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                 total_completion_tokens = sum(
                     tokens for tokens in completion_tokens.values()
                 )
+                cache_report = tokenizer_manager.server_args.enable_cache_report
+                if cache_report:
+                    cached_tokens_sum = sum(
+                        tokens for tokens in cached_tokens.values()
+                    )
+                    prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                else:
+                    prompt_tokens_details = None
                 usage = UsageInfo(
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     total_tokens=total_prompt_tokens + total_completion_tokens,
+                    prompt_tokens_details=prompt_tokens_details,
                 )

                 final_usage_chunk = CompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[],
                     model=request.model,
                     usage=usage,
@@ -851,7 +881,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     if not isinstance(ret, list):
         ret = [ret]

-    response = v1_generate_response(
+    response = v1_generate_response(
+        request,
+        ret,
+        tokenizer_manager,
+        created,
+        cache_report=tokenizer_manager.server_args.enable_cache_report,
+    )
     return response

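In the streaming path, the per-index `cached_tokens` counters are only folded into the final usage chunk, which now also carries `created`. A rough sketch of consuming that chunk with the OpenAI Python client, assuming the server honors `stream_options={"include_usage": True}` the way the surrounding code suggests; the client, endpoint, and model name are assumptions, not part of the diff:

```python
from openai import OpenAI

# Assumes a locally running sglang server exposing the OpenAI-compatible API.
client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

stream = client.completions.create(
    model="default",
    prompt="Say hi",
    max_tokens=8,
    stream=True,
    stream_options={"include_usage": True},  # request the trailing usage chunk
)

for chunk in stream:
    if chunk.usage is not None:
        # The final chunk now includes `created` and, with --enable-cache-report,
        # usage.prompt_tokens_details with the summed cached_tokens.
        print(chunk.created, chunk.usage)
```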
@@ -863,6 +899,7 @@ def v1_chat_generate_request(
     input_ids = []
     sampling_params_list = []
     image_data_list = []
+    audio_data_list = []
     return_logprobs = []
     logprob_start_lens = []
     top_logprobs_nums = []
@@ -876,7 +913,9 @@ def v1_chat_generate_request(
         # - prompt: The full prompt string.
         # - stop: Custom stop tokens.
         # - image_data: None or a list of image strings (URLs or base64 strings).
+        # - audio_data: None or a list of audio strings (URLs).
         # None skips any image processing in GenerateReqInput.
+        strict_tag = None
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -891,6 +930,10 @@ def v1_chat_generate_request(
                 else:
                     tools = [item.function.model_dump() for item in request.tools]

+                tool_call_parser = tokenizer_manager.server_args.tool_call_parser
+                parser = FunctionCallParser(request.tools, tool_call_parser)
+                strict_tag = parser.get_structure_tag()
+
             if chat_template_name is None:
                 openai_compatible_messages = []
                 for message in request.messages:
@@ -920,7 +963,7 @@ def v1_chat_generate_request(
                     )
                 except:
                     # This except branch will be triggered when the chosen model
-                    # has a different tools input format that is not
+                    # has a different tools input format that is not compatible
                     # with openAI's apply_chat_template tool_call format, like Mistral.
                     tools = [t if "function" in t else {"function": t} for t in tools]
                     prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
@@ -940,11 +983,13 @@ def v1_chat_generate_request(
                 prompt_ids += encoded
                 stop = request.stop
                 image_data = None
+                audio_data = None
                 modalities = []
             else:
                 conv = generate_chat_conv(request, chat_template_name)
                 prompt = conv.get_prompt()
                 image_data = conv.image_data
+                audio_data = conv.audio_data
                 modalities = conv.modalities
                 stop = conv.stop_str or []
                 if request.stop:
@@ -958,6 +1003,7 @@ def v1_chat_generate_request(
             prompt_ids = request.messages
             stop = request.stop
             image_data = None
+            audio_data = None
             modalities = []
         input_ids.append(prompt_ids)
         return_logprobs.append(request.logprobs)
@@ -995,9 +1041,26 @@ def v1_chat_generate_request(
             sampling_params["structural_tag"] = convert_json_schema_to_str(
                 request.response_format.model_dump(by_alias=True)
             )
+
+        if strict_tag is not None:
+            if (
+                sampling_params.get("regex")
+                or sampling_params.get("ebnf")
+                or sampling_params.get("structural_tag")
+                or sampling_params.get("json_schema")
+            ):
+                logger.warning(
+                    "Constrained decoding is not compatible with tool calls."
+                )
+            else:
+                sampling_params["structural_tag"] = convert_json_schema_to_str(
+                    strict_tag.model_dump(by_alias=True)
+                )
+
         sampling_params_list.append(sampling_params)

         image_data_list.append(image_data)
+        audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
         if isinstance(input_ids[0], str):
@@ -1006,6 +1069,7 @@ def v1_chat_generate_request(
             prompt_kwargs = {"input_ids": input_ids[0]}
         sampling_params_list = sampling_params_list[0]
         image_data_list = image_data_list[0]
+        audio_data_list = audio_data_list[0]
         return_logprobs = return_logprobs[0]
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
@@ -1020,6 +1084,7 @@ def v1_chat_generate_request(
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         image_data=image_data_list,
+        audio_data=audio_data_list,
         sampling_params=sampling_params_list,
         return_logprob=return_logprobs,
         logprob_start_len=logprob_start_lens,
@@ -1037,6 +1102,7 @@ def v1_chat_generate_request(
 def v1_chat_generate_response(
     request,
     ret,
+    created,
     to_file=False,
     cache_report=False,
     tool_call_parser=None,
@@ -1122,7 +1188,7 @@ def v1_chat_generate_response(
             finish_reason["type"] = "tool_calls"
             finish_reason["matched"] = None
             try:
-
+                text, call_info_list = parser.parse_non_stream(text)
                 tool_calls = [
                     ToolCall(
                         id=str(call_info.tool_index),
@@ -1145,9 +1211,9 @@ def v1_chat_generate_response(
                 "index": 0,
                 "message": {
                     "role": "assistant",
-                    "content": text if
+                    "content": text if text else None,
                     "tool_calls": tool_calls,
-                    "reasoning_content": reasoning_text,
+                    "reasoning_content": reasoning_text if reasoning_text else None,
                 },
                 "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
                 "finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1162,9 +1228,9 @@ def v1_chat_generate_response(
                 index=idx,
                 message=ChatMessage(
                     role="assistant",
-                    content=text if
+                    content=text if text else None,
                     tool_calls=tool_calls,
-                    reasoning_content=reasoning_text,
+                    reasoning_content=reasoning_text if reasoning_text else None,
                 ),
                 logprobs=choice_logprobs,
                 finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1188,7 +1254,7 @@ def v1_chat_generate_response(
                 # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
-                "created":
+                "created": created,
                 "model": request[i].model,
                 "choices": choice,
                 "usage": {
@@ -1210,6 +1276,7 @@ def v1_chat_generate_response(
     cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
     response = ChatCompletionResponse(
         id=ret[0]["meta_info"]["id"],
+        created=created,
         model=request.model,
         choices=choices,
         usage=UsageInfo(
@@ -1224,9 +1291,12 @@ def v1_chat_generate_response(
     return response


-async def v1_chat_completions(
+async def v1_chat_completions(
+    tokenizer_manager, raw_request: Request, cache_report=False
+):
     request_json = await raw_request.json()
     all_requests = [ChatCompletionRequest(**request_json)]
+    created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

     if adapted_request.stream:
@@ -1239,6 +1309,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         n_prev_tokens = {}
         prompt_tokens = {}
         completion_tokens = {}
+        cached_tokens = {}
         try:
             async for content in tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -1252,6 +1323,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
                 if request.logprobs:
                     logprobs = to_openai_style_logprobs(
                         output_token_logprobs=content["meta_info"][
@@ -1309,9 +1381,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         tokenizer_manager.server_args.reasoning_parser
                         and request.separate_reasoning
                     ):
-                        delta = DeltaMessage(
+                        delta = DeltaMessage(
+                            role="assistant", reasoning_content=None
+                        )
                     else:
-                        delta = DeltaMessage(role="assistant", content=
+                        delta = DeltaMessage(role="assistant", content=None)
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=delta,
@@ -1329,6 +1403,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
+                        created=created,
                         choices=[choice_data],
                         model=request.model,
                     )
@@ -1354,7 +1429,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     if reasoning_text:
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
-                            delta=DeltaMessage(
+                            delta=DeltaMessage(
+                                reasoning_content=(
+                                    reasoning_text if reasoning_text else None
+                                )
+                            ),
                             finish_reason=(
                                 None
                                 if finish_reason_type
@@ -1364,6 +1443,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         )
                         chunk = ChatCompletionStreamResponse(
                             id=content["meta_info"]["id"],
+                            created=created,
                             choices=[choice_data],
                             model=request.model,
                         )
@@ -1388,7 +1468,9 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                     if normal_text:
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
-                            delta=DeltaMessage(
+                            delta=DeltaMessage(
+                                content=normal_text if normal_text else None
+                            ),
                             finish_reason=(
                                 None
                                 if finish_reason_type
@@ -1398,6 +1480,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         )
                         chunk = ChatCompletionStreamResponse(
                             id=content["meta_info"]["id"],
+                            created=created,
                             choices=[choice_data],
                             model=request.model,
                         )
@@ -1448,6 +1531,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         )
                         chunk = ChatCompletionStreamResponse(
                             id=content["meta_info"]["id"],
+                            created=created,
                             choices=[choice_data],
                             model=request.model,
                         )
@@ -1460,7 +1544,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         # No tool calls => just treat this as normal text
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
-                            delta=DeltaMessage(content=delta),
+                            delta=DeltaMessage(content=delta if delta else None),
                             finish_reason=(
                                 None
                                 if finish_reason_type and len(finish_reason_type) == 0
@@ -1475,6 +1559,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                         )
                         chunk = ChatCompletionStreamResponse(
                             id=content["meta_info"]["id"],
+                            created=created,
                             choices=[choice_data],
                             model=request.model,
                         )
@@ -1490,14 +1575,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                 total_completion_tokens = sum(
                     tokens for tokens in completion_tokens.values()
                 )
+                cache_report = tokenizer_manager.server_args.enable_cache_report
+                if cache_report:
+                    cached_tokens_sum = sum(
+                        tokens for tokens in cached_tokens.values()
+                    )
+                    prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+                else:
+                    prompt_tokens_details = None
                 usage = UsageInfo(
                     prompt_tokens=total_prompt_tokens,
                     completion_tokens=total_completion_tokens,
                     total_tokens=total_prompt_tokens + total_completion_tokens,
+                    prompt_tokens_details=prompt_tokens_details,
                 )

                 final_usage_chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
+                    created=created,
                     choices=[],
                     model=request.model,
                     usage=usage,
@@ -1530,6 +1625,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     response = v1_chat_generate_response(
         request,
         ret,
+        created,
         cache_report=tokenizer_manager.server_args.enable_cache_report,
         tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
         reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
sglang/srt/openai_api/protocol.py
CHANGED
@@ -16,7 +16,7 @@
 import time
 from typing import Dict, List, Optional, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, root_validator
 from typing_extensions import Literal


@@ -227,14 +227,25 @@ class ChatCompletionMessageContentImageURL(BaseModel):
     detail: Optional[Literal["auto", "low", "high"]] = "auto"


+class ChatCompletionMessageContentAudioURL(BaseModel):
+    url: str
+
+
 class ChatCompletionMessageContentImagePart(BaseModel):
     type: Literal["image_url"]
     image_url: ChatCompletionMessageContentImageURL
     modalities: Optional[Literal["image", "multi-images", "video"]] = "image"


+class ChatCompletionMessageContentAudioPart(BaseModel):
+    type: Literal["audio_url"]
+    audio_url: ChatCompletionMessageContentAudioURL
+
+
 ChatCompletionMessageContentPart = Union[
-    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentAudioPart,
 ]

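The new audio content-part models mirror the existing image part, so a chat message can now mix `text` and `audio_url` entries. A hedged sketch of the request shape these models accept; the endpoint, model name, and URL are placeholders, and audio input only applies to a model whose multimodal processor handles audio (e.g. the MiniCPM-o support added elsewhere in this release):

```python
import requests

# Placeholder endpoint and model; requires an audio-capable multimodal model.
payload = {
    "model": "default",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is said in this clip?"},
                {
                    "type": "audio_url",
                    "audio_url": {"url": "https://example.com/clip.wav"},
                },
            ],
        }
    ],
}
resp = requests.post("http://127.0.0.1:30000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```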
@@ -276,6 +287,7 @@ class Function(BaseModel):
     description: Optional[str] = Field(default=None, examples=[None])
     name: Optional[str] = None
     parameters: Optional[object] = None
+    strict: bool = False


 class Tool(BaseModel):
@@ -323,6 +335,15 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     ) # noqa

+    @root_validator(pre=True)
+    def set_tool_choice_default(cls, values):
+        if values.get("tool_choice") is None:
+            if values.get("tools") is None:
+                values["tool_choice"] = "none"
+            else:
+                values["tool_choice"] = "auto"
+        return values
+
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0
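The new `root_validator` gives `tool_choice` an OpenAI-style default that depends on whether `tools` is present. A small sketch of the resulting behavior when the request model is parsed directly; the field values are illustrative only:

```python
from sglang.srt.openai_api.protocol import ChatCompletionRequest

msgs = [{"role": "user", "content": "hi"}]

# No tools supplied -> tool_choice defaults to "none".
req = ChatCompletionRequest(model="default", messages=msgs)
print(req.tool_choice)  # "none"

# Tools supplied -> tool_choice defaults to "auto".
tools = [{"type": "function", "function": {"name": "get_time", "parameters": {}}}]
req = ChatCompletionRequest(model="default", messages=msgs, tools=tools)
print(req.tool_choice)  # "auto"
```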
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -306,7 +306,7 @@ class SamplingBatchInfo:
         ]:
             self_val = getattr(self, item, None)
             other_val = getattr(other, item, None)
-            setattr(self, item, torch.
+            setattr(self, item, torch.cat([self_val, other_val]))

         self.is_all_greedy |= other.is_all_greedy
         self.need_min_p_sampling |= other.need_min_p_sampling
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -77,7 +77,7 @@ class SamplingParams:
         self.custom_params = custom_params

         # Process some special cases
-        if self.temperature < _SAMPLING_EPS:
+        if 0 <= self.temperature < _SAMPLING_EPS:
             # top_k = 1 means greedy sampling
             self.temperature = 1.0
             self.top_k = 1
@@ -93,9 +93,9 @@ class SamplingParams:
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
         if not 0.0 <= self.min_p <= 1.0:
             raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
-        if self.top_k <
+        if self.top_k < 1 or self.top_k == -1:
             raise ValueError(
-                f"top_k must be -1 (disable)
+                f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
             )
         if not -2.0 <= self.frequency_penalty <= 2.0:
             raise ValueError(
@@ -108,12 +108,12 @@ class SamplingParams:
             )
         if not 0.0 <= self.repetition_penalty <= 2.0:
             raise ValueError(
-                "repetition_penalty must be in
+                "repetition_penalty must be in [0, 2], got "
                 f"{self.repetition_penalty}."
             )
         if not 0 <= self.min_new_tokens:
             raise ValueError(
-                f"min_new_tokens must be in
+                f"min_new_tokens must be in [0, max_new_tokens], got "
                 f"{self.min_new_tokens}."
             )
         if self.max_new_tokens is not None:
@@ -123,7 +123,7 @@ class SamplingParams:
             )
         if not self.min_new_tokens <= self.max_new_tokens:
             raise ValueError(
-                f"min_new_tokens must be in
+                f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
         grammars = [
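The sampling-parameter hunks restrict the greedy-sampling special case to non-negative temperatures and spell out the previously truncated error messages. A brief sketch of what this looks like from the caller's side; the `verify()` call is an assumption about where these checks live (the hunk context only shows `class SamplingParams:`):

```python
from sglang.srt.sampling.sampling_params import SamplingParams

# A temperature in [0, _SAMPLING_EPS) is mapped to greedy sampling
# (temperature=1.0, top_k=1) at construction time.
greedy = SamplingParams(temperature=0.0)

params = SamplingParams(temperature=0.7, top_k=5, repetition_penalty=1.1)
# Assumed entry point for the checks above; per the error messages they enforce
# top_p in (0, 1], min_p in [0, 1], top_k == -1 (disable) or at least 1,
# repetition_penalty in [0, 2], and 0 <= min_new_tokens <= max_new_tokens.
params.verify()
```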