sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +164 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +62 -23
  49. sglang/srt/layers/elementwise.py +411 -0
  50. sglang/srt/layers/layernorm.py +24 -2
  51. sglang/srt/layers/linear.py +17 -5
  52. sglang/srt/layers/logits_processor.py +26 -7
  53. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  54. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  55. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  56. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  63. sglang/srt/layers/moe/router.py +342 -0
  64. sglang/srt/layers/moe/topk.py +31 -18
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +184 -126
  67. sglang/srt/layers/quantization/base_config.py +5 -0
  68. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  69. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  70. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  75. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  76. sglang/srt/layers/quantization/fp8.py +76 -34
  77. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  78. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  79. sglang/srt/layers/quantization/gptq.py +36 -9
  80. sglang/srt/layers/quantization/kv_cache.py +98 -0
  81. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  82. sglang/srt/layers/quantization/utils.py +153 -0
  83. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  84. sglang/srt/layers/rotary_embedding.py +66 -87
  85. sglang/srt/layers/sampler.py +1 -1
  86. sglang/srt/lora/layers.py +68 -0
  87. sglang/srt/lora/lora.py +2 -22
  88. sglang/srt/lora/lora_manager.py +47 -23
  89. sglang/srt/lora/mem_pool.py +110 -51
  90. sglang/srt/lora/utils.py +12 -1
  91. sglang/srt/managers/cache_controller.py +4 -5
  92. sglang/srt/managers/data_parallel_controller.py +31 -9
  93. sglang/srt/managers/expert_distribution.py +81 -0
  94. sglang/srt/managers/io_struct.py +39 -3
  95. sglang/srt/managers/mm_utils.py +373 -0
  96. sglang/srt/managers/multimodal_processor.py +68 -0
  97. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  98. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  99. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  100. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  101. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  102. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  103. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  104. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  105. sglang/srt/managers/schedule_batch.py +134 -31
  106. sglang/srt/managers/scheduler.py +325 -38
  107. sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
  108. sglang/srt/managers/session_controller.py +1 -1
  109. sglang/srt/managers/tokenizer_manager.py +59 -23
  110. sglang/srt/managers/tp_worker.py +1 -1
  111. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  112. sglang/srt/managers/utils.py +6 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +27 -8
  114. sglang/srt/mem_cache/memory_pool.py +258 -98
  115. sglang/srt/mem_cache/paged_allocator.py +2 -2
  116. sglang/srt/mem_cache/radix_cache.py +4 -4
  117. sglang/srt/model_executor/cuda_graph_runner.py +85 -28
  118. sglang/srt/model_executor/forward_batch_info.py +81 -15
  119. sglang/srt/model_executor/model_runner.py +70 -6
  120. sglang/srt/model_loader/loader.py +160 -2
  121. sglang/srt/model_loader/weight_utils.py +45 -0
  122. sglang/srt/models/deepseek_janus_pro.py +29 -86
  123. sglang/srt/models/deepseek_nextn.py +22 -10
  124. sglang/srt/models/deepseek_v2.py +326 -192
  125. sglang/srt/models/deepseek_vl2.py +358 -0
  126. sglang/srt/models/gemma3_causal.py +684 -0
  127. sglang/srt/models/gemma3_mm.py +462 -0
  128. sglang/srt/models/grok.py +374 -119
  129. sglang/srt/models/llama.py +47 -7
  130. sglang/srt/models/llama_eagle.py +1 -0
  131. sglang/srt/models/llama_eagle3.py +196 -0
  132. sglang/srt/models/llava.py +3 -3
  133. sglang/srt/models/llavavid.py +3 -3
  134. sglang/srt/models/minicpmo.py +1995 -0
  135. sglang/srt/models/minicpmv.py +62 -137
  136. sglang/srt/models/mllama.py +4 -4
  137. sglang/srt/models/phi3_small.py +1 -1
  138. sglang/srt/models/qwen2.py +3 -0
  139. sglang/srt/models/qwen2_5_vl.py +68 -146
  140. sglang/srt/models/qwen2_classification.py +75 -0
  141. sglang/srt/models/qwen2_moe.py +9 -1
  142. sglang/srt/models/qwen2_vl.py +25 -63
  143. sglang/srt/openai_api/adapter.py +145 -47
  144. sglang/srt/openai_api/protocol.py +23 -2
  145. sglang/srt/sampling/sampling_batch_info.py +1 -1
  146. sglang/srt/sampling/sampling_params.py +6 -6
  147. sglang/srt/server_args.py +104 -14
  148. sglang/srt/speculative/build_eagle_tree.py +7 -347
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  150. sglang/srt/speculative/eagle_utils.py +208 -252
  151. sglang/srt/speculative/eagle_worker.py +139 -53
  152. sglang/srt/speculative/spec_info.py +6 -1
  153. sglang/srt/torch_memory_saver_adapter.py +22 -0
  154. sglang/srt/utils.py +182 -21
  155. sglang/test/__init__.py +0 -0
  156. sglang/test/attention/__init__.py +0 -0
  157. sglang/test/attention/test_flashattn_backend.py +312 -0
  158. sglang/test/runners.py +2 -0
  159. sglang/test/test_activation.py +2 -1
  160. sglang/test/test_block_fp8.py +5 -4
  161. sglang/test/test_block_fp8_ep.py +2 -1
  162. sglang/test/test_dynamic_grad_mode.py +58 -0
  163. sglang/test/test_layernorm.py +3 -2
  164. sglang/test/test_utils.py +55 -4
  165. sglang/utils.py +31 -0
  166. sglang/version.py +1 -1
  167. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  168. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
  169. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  170. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  171. sglang/srt/managers/image_processor.py +0 -55
  172. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  173. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  174. sglang/srt/managers/multi_modality_padding.py +0 -134
  175. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  176. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0

sglang/srt/openai_api/adapter.py

@@ -20,19 +20,16 @@ import os
  import time
  import uuid
  from http import HTTPStatus
- from typing import Dict, List
+ from typing import Any, Dict, List, Set

  from fastapi import HTTPException, Request, UploadFile
  from fastapi.responses import ORJSONResponse, StreamingResponse
  from pydantic import ValidationError

- try:
- from outlines.fsm.json_schema import convert_json_schema_to_str
- except ImportError:
- # Before outlines 0.0.47, convert_json_schema_to_str is under
- # outlines.integrations.utils
- from outlines.integrations.utils import convert_json_schema_to_str
-
+ from sglang.srt.code_completion_parser import (
+ generate_completion_prompt_from_request,
+ is_completion_template_defined,
+ )
  from sglang.srt.conversation import (
  Conversation,
  SeparatorStyle,
@@ -41,7 +38,7 @@ from sglang.srt.conversation import (
  generate_embedding_convs,
  register_conv_template,
  )
- from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser
+ from sglang.srt.function_call_parser import FunctionCallParser
  from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
  from sglang.srt.openai_api.protocol import (
  BatchRequest,
@@ -75,7 +72,7 @@ from sglang.srt.openai_api.protocol import (
  UsageInfo,
  )
  from sglang.srt.reasoning_parser import ReasoningParser
- from sglang.utils import get_exception_traceback
+ from sglang.utils import convert_json_schema_to_str, get_exception_traceback

  logger = logging.getLogger(__name__)

@@ -310,6 +307,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  )

  try:
+ created = int(time.time())
  ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
  if not isinstance(ret, list):
  ret = [ret]
@@ -317,13 +315,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  responses = v1_chat_generate_response(
  request,
  ret,
+ created,
  to_file=True,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  )
  else:
  responses = v1_generate_response(
- request, ret, tokenizer_manager, to_file=True
+ request,
+ ret,
+ tokenizer_manager,
+ created,
+ to_file=True,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
  )

  except Exception as e:
@@ -504,7 +508,11 @@ def v1_generate_request(
  "To compute logprobs of input prompt, please use the native /generate API."
  )

- prompts.append(request.prompt)
+ prompt = request.prompt
+ if is_completion_template_defined():
+ prompt = generate_completion_prompt_from_request(request)
+ prompts.append(prompt)
+
  lora_paths.append(request.lora_path)
  if request.echo and request.logprobs:
  current_logprob_start_len = 0
@@ -569,7 +577,9 @@ def v1_generate_request(
  return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


- def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+ def v1_generate_response(
+ request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+ ):
  choices = []
  echo = False

@@ -667,7 +677,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  # remain the same but if needed we can change that
  "id": ret[i]["meta_info"]["id"],
  "object": "text_completion",
- "created": int(time.time()),
+ "created": created,
  "model": request[i].model,
  "choices": choice,
  "usage": {
@@ -686,14 +696,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
  )
  completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+ cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
  response = CompletionResponse(
  id=ret[0]["meta_info"]["id"],
  model=request.model,
+ created=created,
  choices=choices,
  usage=UsageInfo(
  prompt_tokens=prompt_tokens,
  completion_tokens=completion_tokens,
  total_tokens=prompt_tokens + completion_tokens,
+ prompt_tokens_details=(
+ {"cached_tokens": cached_tokens} if cache_report else None
+ ),
  ),
  )
  return response
@@ -702,6 +717,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  async def v1_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
  all_requests = [CompletionRequest(**request_json)]
+ created = int(time.time())
  adapted_request, request = v1_generate_request(all_requests)

  if adapted_request.stream:
@@ -711,6 +727,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  n_prev_tokens = {}
  prompt_tokens = {}
  completion_tokens = {}
+ cached_tokens = {}
+
  try:
  async for content in tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -723,6 +741,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  text = content["text"]
  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
+ cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)

  if not stream_buffer: # The first chunk
  if request.echo:
@@ -795,6 +814,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = CompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  object="text_completion",
  choices=[choice_data],
  model=request.model,
@@ -813,14 +833,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  total_completion_tokens = sum(
  tokens for tokens in completion_tokens.values()
  )
+ cache_report = tokenizer_manager.server_args.enable_cache_report
+ if cache_report:
+ cached_tokens_sum = sum(
+ tokens for tokens in cached_tokens.values()
+ )
+ prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+ else:
+ prompt_tokens_details = None
  usage = UsageInfo(
  prompt_tokens=total_prompt_tokens,
  completion_tokens=total_completion_tokens,
  total_tokens=total_prompt_tokens + total_completion_tokens,
+ prompt_tokens_details=prompt_tokens_details,
  )

  final_usage_chunk = CompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[],
  model=request.model,
  usage=usage,
@@ -851,7 +881,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  if not isinstance(ret, list):
  ret = [ret]

- response = v1_generate_response(request, ret, tokenizer_manager)
+ response = v1_generate_response(
+ request,
+ ret,
+ tokenizer_manager,
+ created,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
+ )
  return response


@@ -863,6 +899,7 @@ def v1_chat_generate_request(
  input_ids = []
  sampling_params_list = []
  image_data_list = []
+ audio_data_list = []
  return_logprobs = []
  logprob_start_lens = []
  top_logprobs_nums = []
@@ -876,7 +913,9 @@ def v1_chat_generate_request(
  # - prompt: The full prompt string.
  # - stop: Custom stop tokens.
  # - image_data: None or a list of image strings (URLs or base64 strings).
+ # - audio_data: None or a list of audio strings (URLs).
  # None skips any image processing in GenerateReqInput.
+ strict_tag = None
  if not isinstance(request.messages, str):
  # Apply chat template and its stop strings.
  tools = None
@@ -891,6 +930,10 @@ def v1_chat_generate_request(
  else:
  tools = [item.function.model_dump() for item in request.tools]

+ tool_call_parser = tokenizer_manager.server_args.tool_call_parser
+ parser = FunctionCallParser(request.tools, tool_call_parser)
+ strict_tag = parser.get_structure_tag()
+
  if chat_template_name is None:
  openai_compatible_messages = []
  for message in request.messages:
@@ -920,7 +963,7 @@ def v1_chat_generate_request(
  )
  except:
  # This except branch will be triggered when the chosen model
- # has a different tools input format that is not compatiable
+ # has a different tools input format that is not compatible
  # with openAI's apply_chat_template tool_call format, like Mistral.
  tools = [t if "function" in t else {"function": t} for t in tools]
  prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
@@ -940,11 +983,13 @@ def v1_chat_generate_request(
  prompt_ids += encoded
  stop = request.stop
  image_data = None
+ audio_data = None
  modalities = []
  else:
  conv = generate_chat_conv(request, chat_template_name)
  prompt = conv.get_prompt()
  image_data = conv.image_data
+ audio_data = conv.audio_data
  modalities = conv.modalities
  stop = conv.stop_str or []
  if request.stop:
@@ -958,6 +1003,7 @@ def v1_chat_generate_request(
  prompt_ids = request.messages
  stop = request.stop
  image_data = None
+ audio_data = None
  modalities = []
  input_ids.append(prompt_ids)
  return_logprobs.append(request.logprobs)
@@ -995,9 +1041,26 @@ def v1_chat_generate_request(
  sampling_params["structural_tag"] = convert_json_schema_to_str(
  request.response_format.model_dump(by_alias=True)
  )
+
+ if strict_tag is not None:
+ if (
+ sampling_params.get("regex")
+ or sampling_params.get("ebnf")
+ or sampling_params.get("structural_tag")
+ or sampling_params.get("json_schema")
+ ):
+ logger.warning(
+ "Constrained decoding is not compatible with tool calls."
+ )
+ else:
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
+ strict_tag.model_dump(by_alias=True)
+ )
+
  sampling_params_list.append(sampling_params)

  image_data_list.append(image_data)
+ audio_data_list.append(audio_data)
  modalities_list.append(modalities)
  if len(all_requests) == 1:
  if isinstance(input_ids[0], str):
@@ -1006,6 +1069,7 @@ def v1_chat_generate_request(
  prompt_kwargs = {"input_ids": input_ids[0]}
  sampling_params_list = sampling_params_list[0]
  image_data_list = image_data_list[0]
+ audio_data_list = audio_data_list[0]
  return_logprobs = return_logprobs[0]
  logprob_start_lens = logprob_start_lens[0]
  top_logprobs_nums = top_logprobs_nums[0]
@@ -1020,6 +1084,7 @@ def v1_chat_generate_request(
  adapted_request = GenerateReqInput(
  **prompt_kwargs,
  image_data=image_data_list,
+ audio_data=audio_data_list,
  sampling_params=sampling_params_list,
  return_logprob=return_logprobs,
  logprob_start_len=logprob_start_lens,
@@ -1037,6 +1102,7 @@ def v1_chat_generate_request(
  def v1_chat_generate_response(
  request,
  ret,
+ created,
  to_file=False,
  cache_report=False,
  tool_call_parser=None,
@@ -1115,27 +1181,29 @@ def v1_chat_generate_response(
  else:
  reasoning_text = None

- if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
- if finish_reason == "stop":
- finish_reason = "tool_calls"
- try:
- parser = FunctionCallParser(tools, tool_call_parser)
- full_normal_text, call_info_list = parser.parse_non_stream(text)
- tool_calls = [
- ToolCall(
- id=str(call_info.tool_index),
- function=FunctionResponse(
- name=call_info.name, arguments=call_info.parameters
- ),
+ if tool_choice != "none" and tools:
+ parser = FunctionCallParser(tools, tool_call_parser)
+ if parser.has_tool_call(text):
+ if finish_reason["type"] == "stop":
+ finish_reason["type"] = "tool_calls"
+ finish_reason["matched"] = None
+ try:
+ text, call_info_list = parser.parse_non_stream(text)
+ tool_calls = [
+ ToolCall(
+ id=str(call_info.tool_index),
+ function=FunctionResponse(
+ name=call_info.name, arguments=call_info.parameters
+ ),
+ )
+ for call_info in call_info_list
+ ]
+ except Exception as e:
+ logger.error(f"Exception: {e}")
+ return create_error_response(
+ HTTPStatus.BAD_REQUEST,
+ "Failed to parse fc related info to json format!",
  )
- for call_info in call_info_list
- ]
- except Exception as e:
- logger.error(f"Exception: {e}")
- return create_error_response(
- HTTPStatus.BAD_REQUEST,
- "Failed to parse fc related info to json format!",
- )

  if to_file:
  # to make the choice data json serializable
@@ -1143,9 +1211,9 @@ def v1_chat_generate_response(
  "index": 0,
  "message": {
  "role": "assistant",
- "content": text if tool_calls is None else None,
+ "content": text if text else None,
  "tool_calls": tool_calls,
- "reasoning_content": reasoning_text,
+ "reasoning_content": reasoning_text if reasoning_text else None,
  },
  "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
  "finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1160,9 +1228,9 @@ def v1_chat_generate_response(
  index=idx,
  message=ChatMessage(
  role="assistant",
- content=text if tool_calls is None else None,
+ content=text if text else None,
  tool_calls=tool_calls,
- reasoning_content=reasoning_text,
+ reasoning_content=reasoning_text if reasoning_text else None,
  ),
  logprobs=choice_logprobs,
  finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1186,7 +1254,7 @@ def v1_chat_generate_response(
  # remain the same but if needed we can change that
  "id": ret[i]["meta_info"]["id"],
  "object": "chat.completion",
- "created": int(time.time()),
+ "created": created,
  "model": request[i].model,
  "choices": choice,
  "usage": {
@@ -1208,6 +1276,7 @@ def v1_chat_generate_response(
  cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
  response = ChatCompletionResponse(
  id=ret[0]["meta_info"]["id"],
+ created=created,
  model=request.model,
  choices=choices,
  usage=UsageInfo(
@@ -1222,9 +1291,12 @@ def v1_chat_generate_response(
  return response


- async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+ async def v1_chat_completions(
+ tokenizer_manager, raw_request: Request, cache_report=False
+ ):
  request_json = await raw_request.json()
  all_requests = [ChatCompletionRequest(**request_json)]
+ created = int(time.time())
  adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

  if adapted_request.stream:
@@ -1237,6 +1309,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  n_prev_tokens = {}
  prompt_tokens = {}
  completion_tokens = {}
+ cached_tokens = {}
  try:
  async for content in tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -1250,6 +1323,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
+ cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
  if request.logprobs:
  logprobs = to_openai_style_logprobs(
  output_token_logprobs=content["meta_info"][
@@ -1307,9 +1381,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  tokenizer_manager.server_args.reasoning_parser
  and request.separate_reasoning
  ):
- delta = DeltaMessage(role="assistant", reasoning_content="")
+ delta = DeltaMessage(
+ role="assistant", reasoning_content=None
+ )
  else:
- delta = DeltaMessage(role="assistant", content="")
+ delta = DeltaMessage(role="assistant", content=None)
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
  delta=delta,
@@ -1327,6 +1403,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1352,7 +1429,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if reasoning_text:
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(reasoning_content=reasoning_text),
+ delta=DeltaMessage(
+ reasoning_content=(
+ reasoning_text if reasoning_text else None
+ )
+ ),
  finish_reason=(
  None
  if finish_reason_type
@@ -1362,6 +1443,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1386,7 +1468,9 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if normal_text:
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(content=normal_text),
+ delta=DeltaMessage(
+ content=normal_text if normal_text else None
+ ),
  finish_reason=(
  None
  if finish_reason_type
@@ -1396,6 +1480,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1446,6 +1531,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1458,7 +1544,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  # No tool calls => just treat this as normal text
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(content=delta),
+ delta=DeltaMessage(content=delta if delta else None),
  finish_reason=(
  None
  if finish_reason_type and len(finish_reason_type) == 0
@@ -1473,6 +1559,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1488,14 +1575,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  total_completion_tokens = sum(
  tokens for tokens in completion_tokens.values()
  )
+ cache_report = tokenizer_manager.server_args.enable_cache_report
+ if cache_report:
+ cached_tokens_sum = sum(
+ tokens for tokens in cached_tokens.values()
+ )
+ prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+ else:
+ prompt_tokens_details = None
  usage = UsageInfo(
  prompt_tokens=total_prompt_tokens,
  completion_tokens=total_completion_tokens,
  total_tokens=total_prompt_tokens + total_completion_tokens,
+ prompt_tokens_details=prompt_tokens_details,
  )

  final_usage_chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[],
  model=request.model,
  usage=usage,
@@ -1528,6 +1625,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  response = v1_chat_generate_response(
  request,
  ret,
+ created,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
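
The adapter changes above thread a single `created` timestamp through every response object and, when `enable_cache_report` is set in the server args, surface prefix-cache hits via `usage.prompt_tokens_details`. As a rough, non-authoritative sketch of how a client might read the new field (assuming a local server launched with the corresponding `--enable-cache-report` flag on the default port):

    # Hypothetical client-side check of the cached-token usage added in this diff.
    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",
        json={
            "model": "default",
            "messages": [{"role": "user", "content": "Hello"}],
        },
    ).json()

    usage = resp["usage"]
    print(usage["prompt_tokens"], usage["completion_tokens"])
    # prompt_tokens_details is only populated when cache reporting is enabled.
    print((usage.get("prompt_tokens_details") or {}).get("cached_tokens"))
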

sglang/srt/openai_api/protocol.py

@@ -16,7 +16,7 @@
  import time
  from typing import Dict, List, Optional, Union

- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, root_validator
  from typing_extensions import Literal


@@ -227,14 +227,25 @@ class ChatCompletionMessageContentImageURL(BaseModel):
  detail: Optional[Literal["auto", "low", "high"]] = "auto"


+ class ChatCompletionMessageContentAudioURL(BaseModel):
+ url: str
+
+
  class ChatCompletionMessageContentImagePart(BaseModel):
  type: Literal["image_url"]
  image_url: ChatCompletionMessageContentImageURL
  modalities: Optional[Literal["image", "multi-images", "video"]] = "image"


+ class ChatCompletionMessageContentAudioPart(BaseModel):
+ type: Literal["audio_url"]
+ audio_url: ChatCompletionMessageContentAudioURL
+
+
  ChatCompletionMessageContentPart = Union[
- ChatCompletionMessageContentTextPart, ChatCompletionMessageContentImagePart
+ ChatCompletionMessageContentTextPart,
+ ChatCompletionMessageContentImagePart,
+ ChatCompletionMessageContentAudioPart,
  ]


@@ -276,6 +287,7 @@ class Function(BaseModel):
  description: Optional[str] = Field(default=None, examples=[None])
  name: Optional[str] = None
  parameters: Optional[object] = None
+ strict: bool = False


  class Tool(BaseModel):
@@ -323,6 +335,15 @@ class ChatCompletionRequest(BaseModel):
  default="auto", examples=["none"]
  ) # noqa

+ @root_validator(pre=True)
+ def set_tool_choice_default(cls, values):
+ if values.get("tool_choice") is None:
+ if values.get("tools") is None:
+ values["tool_choice"] = "none"
+ else:
+ values["tool_choice"] = "auto"
+ return values
+
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
  top_k: int = -1
  min_p: float = 0.0
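
The protocol additions define an `audio_url` content part alongside the existing text and image parts, and a `root_validator` that defaults `tool_choice` to "auto" only when `tools` are supplied. A hypothetical chat request body using the new part type could look like the sketch below (the audio URL is a placeholder, not a real asset):

    # Sketch of a chat message mixing a text part with the new audio_url part.
    payload = {
        "model": "default",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe this clip."},
                    {
                        "type": "audio_url",
                        "audio_url": {"url": "https://example.com/clip.wav"},
                    },
                ],
            }
        ],
    }
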

sglang/srt/sampling/sampling_batch_info.py

@@ -306,7 +306,7 @@ class SamplingBatchInfo:
  ]:
  self_val = getattr(self, item, None)
  other_val = getattr(other, item, None)
- setattr(self, item, torch.concat([self_val, other_val]))
+ setattr(self, item, torch.cat([self_val, other_val]))

  self.is_all_greedy |= other.is_all_greedy
  self.need_min_p_sampling |= other.need_min_p_sampling

sglang/srt/sampling/sampling_params.py

@@ -77,7 +77,7 @@ class SamplingParams:
  self.custom_params = custom_params

  # Process some special cases
- if self.temperature < _SAMPLING_EPS:
+ if 0 <= self.temperature < _SAMPLING_EPS:
  # top_k = 1 means greedy sampling
  self.temperature = 1.0
  self.top_k = 1
@@ -93,9 +93,9 @@ class SamplingParams:
  raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
  if not 0.0 <= self.min_p <= 1.0:
  raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
- if self.top_k < -1 or self.top_k == 0:
+ if self.top_k < 1 or self.top_k == -1:
  raise ValueError(
- f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
+ f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
  )
  if not -2.0 <= self.frequency_penalty <= 2.0:
  raise ValueError(
@@ -108,12 +108,12 @@ class SamplingParams:
  )
  if not 0.0 <= self.repetition_penalty <= 2.0:
  raise ValueError(
- "repetition_penalty must be in (0, 2], got "
+ "repetition_penalty must be in [0, 2], got "
  f"{self.repetition_penalty}."
  )
  if not 0 <= self.min_new_tokens:
  raise ValueError(
- f"min_new_tokens must be in (0, max_new_tokens], got "
+ f"min_new_tokens must be in [0, max_new_tokens], got "
  f"{self.min_new_tokens}."
  )
  if self.max_new_tokens is not None:
@@ -123,7 +123,7 @@ class SamplingParams:
  )
  if not self.min_new_tokens <= self.max_new_tokens:
  raise ValueError(
- f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
+ f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
  f"{self.min_new_tokens}."
  )
  grammars = [
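
The revised `SamplingParams` messages spell out the accepted ranges (`top_p` in (0, 1], `min_p` in [0, 1], `repetition_penalty` in [0, 2], `min_new_tokens` in [0, max_new_tokens]) and keep the near-zero-temperature path mapped to greedy decoding. Purely as an illustration of values that pass these checks, with field names taken from the request schema above and values chosen arbitrarily:

    # Sampling settings consistent with the validation rules in these hunks.
    payload = {
        "model": "default",
        "messages": [{"role": "user", "content": "Count to three."}],
        "temperature": 0.0,  # below the sampling epsilon, so treated as greedy
        "top_p": 1.0,        # must lie in (0, 1]
        "min_p": 0.0,        # must lie in [0, 1]
    }
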