sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +133 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +32 -21
  49. sglang/srt/layers/layernorm.py +24 -2
  50. sglang/srt/layers/linear.py +17 -5
  51. sglang/srt/layers/logits_processor.py +25 -7
  52. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  53. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  54. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  55. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  62. sglang/srt/layers/moe/topk.py +31 -18
  63. sglang/srt/layers/parameter.py +1 -1
  64. sglang/srt/layers/quantization/__init__.py +184 -126
  65. sglang/srt/layers/quantization/base_config.py +5 -0
  66. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  67. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  69. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  70. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  71. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  73. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  74. sglang/srt/layers/quantization/fp8.py +76 -34
  75. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  76. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  77. sglang/srt/layers/quantization/gptq.py +36 -9
  78. sglang/srt/layers/quantization/kv_cache.py +98 -0
  79. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  80. sglang/srt/layers/quantization/utils.py +153 -0
  81. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  82. sglang/srt/layers/rotary_embedding.py +66 -87
  83. sglang/srt/layers/sampler.py +1 -1
  84. sglang/srt/lora/layers.py +68 -0
  85. sglang/srt/lora/lora.py +2 -22
  86. sglang/srt/lora/lora_manager.py +47 -23
  87. sglang/srt/lora/mem_pool.py +110 -51
  88. sglang/srt/lora/utils.py +12 -1
  89. sglang/srt/managers/cache_controller.py +2 -5
  90. sglang/srt/managers/data_parallel_controller.py +30 -8
  91. sglang/srt/managers/expert_distribution.py +81 -0
  92. sglang/srt/managers/io_struct.py +39 -3
  93. sglang/srt/managers/mm_utils.py +373 -0
  94. sglang/srt/managers/multimodal_processor.py +68 -0
  95. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  96. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  97. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  98. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  99. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  100. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  101. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  102. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  103. sglang/srt/managers/schedule_batch.py +133 -30
  104. sglang/srt/managers/scheduler.py +273 -20
  105. sglang/srt/managers/session_controller.py +1 -1
  106. sglang/srt/managers/tokenizer_manager.py +59 -23
  107. sglang/srt/managers/tp_worker.py +1 -1
  108. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  109. sglang/srt/managers/utils.py +6 -1
  110. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  111. sglang/srt/mem_cache/memory_pool.py +255 -98
  112. sglang/srt/mem_cache/paged_allocator.py +2 -2
  113. sglang/srt/mem_cache/radix_cache.py +4 -4
  114. sglang/srt/model_executor/cuda_graph_runner.py +27 -13
  115. sglang/srt/model_executor/forward_batch_info.py +68 -11
  116. sglang/srt/model_executor/model_runner.py +70 -6
  117. sglang/srt/model_loader/loader.py +160 -2
  118. sglang/srt/model_loader/weight_utils.py +45 -0
  119. sglang/srt/models/deepseek_janus_pro.py +29 -86
  120. sglang/srt/models/deepseek_nextn.py +22 -10
  121. sglang/srt/models/deepseek_v2.py +208 -77
  122. sglang/srt/models/deepseek_vl2.py +358 -0
  123. sglang/srt/models/gemma3_causal.py +684 -0
  124. sglang/srt/models/gemma3_mm.py +462 -0
  125. sglang/srt/models/llama.py +47 -7
  126. sglang/srt/models/llama_eagle.py +1 -0
  127. sglang/srt/models/llama_eagle3.py +196 -0
  128. sglang/srt/models/llava.py +3 -3
  129. sglang/srt/models/llavavid.py +3 -3
  130. sglang/srt/models/minicpmo.py +1995 -0
  131. sglang/srt/models/minicpmv.py +62 -137
  132. sglang/srt/models/mllama.py +4 -4
  133. sglang/srt/models/phi3_small.py +1 -1
  134. sglang/srt/models/qwen2.py +3 -0
  135. sglang/srt/models/qwen2_5_vl.py +68 -146
  136. sglang/srt/models/qwen2_classification.py +75 -0
  137. sglang/srt/models/qwen2_moe.py +9 -1
  138. sglang/srt/models/qwen2_vl.py +25 -63
  139. sglang/srt/openai_api/adapter.py +124 -28
  140. sglang/srt/openai_api/protocol.py +23 -2
  141. sglang/srt/sampling/sampling_batch_info.py +1 -1
  142. sglang/srt/sampling/sampling_params.py +6 -6
  143. sglang/srt/server_args.py +99 -9
  144. sglang/srt/speculative/build_eagle_tree.py +7 -347
  145. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  146. sglang/srt/speculative/eagle_utils.py +208 -252
  147. sglang/srt/speculative/eagle_worker.py +139 -53
  148. sglang/srt/speculative/spec_info.py +6 -1
  149. sglang/srt/torch_memory_saver_adapter.py +22 -0
  150. sglang/srt/utils.py +182 -21
  151. sglang/test/__init__.py +0 -0
  152. sglang/test/attention/__init__.py +0 -0
  153. sglang/test/attention/test_flashattn_backend.py +312 -0
  154. sglang/test/runners.py +2 -0
  155. sglang/test/test_activation.py +2 -1
  156. sglang/test/test_block_fp8.py +5 -4
  157. sglang/test/test_block_fp8_ep.py +2 -1
  158. sglang/test/test_dynamic_grad_mode.py +58 -0
  159. sglang/test/test_layernorm.py +3 -2
  160. sglang/test/test_utils.py +55 -4
  161. sglang/utils.py +31 -0
  162. sglang/version.py +1 -1
  163. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  164. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +167 -123
  165. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  166. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  167. sglang/srt/managers/image_processor.py +0 -55
  168. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  169. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  170. sglang/srt/managers/multi_modality_padding.py +0 -134
  171. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  172. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0

sglang/srt/openai_api/adapter.py

@@ -20,19 +20,16 @@ import os
  import time
  import uuid
  from http import HTTPStatus
- from typing import Dict, List
+ from typing import Any, Dict, List, Set

  from fastapi import HTTPException, Request, UploadFile
  from fastapi.responses import ORJSONResponse, StreamingResponse
  from pydantic import ValidationError

- try:
- from outlines.fsm.json_schema import convert_json_schema_to_str
- except ImportError:
- # Before outlines 0.0.47, convert_json_schema_to_str is under
- # outlines.integrations.utils
- from outlines.integrations.utils import convert_json_schema_to_str
-
+ from sglang.srt.code_completion_parser import (
+ generate_completion_prompt_from_request,
+ is_completion_template_defined,
+ )
  from sglang.srt.conversation import (
  Conversation,
  SeparatorStyle,
@@ -41,7 +38,7 @@ from sglang.srt.conversation import (
  generate_embedding_convs,
  register_conv_template,
  )
- from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser
+ from sglang.srt.function_call_parser import FunctionCallParser
  from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
  from sglang.srt.openai_api.protocol import (
  BatchRequest,
@@ -75,7 +72,7 @@ from sglang.srt.openai_api.protocol import (
  UsageInfo,
  )
  from sglang.srt.reasoning_parser import ReasoningParser
- from sglang.utils import get_exception_traceback
+ from sglang.utils import convert_json_schema_to_str, get_exception_traceback

  logger = logging.getLogger(__name__)

@@ -310,6 +307,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  )

  try:
+ created = int(time.time())
  ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
  if not isinstance(ret, list):
  ret = [ret]
@@ -317,13 +315,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  responses = v1_chat_generate_response(
  request,
  ret,
+ created,
  to_file=True,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  )
  else:
  responses = v1_generate_response(
- request, ret, tokenizer_manager, to_file=True
+ request,
+ ret,
+ tokenizer_manager,
+ created,
+ to_file=True,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
  )

  except Exception as e:
@@ -504,7 +508,11 @@ def v1_generate_request(
  "To compute logprobs of input prompt, please use the native /generate API."
  )

- prompts.append(request.prompt)
+ prompt = request.prompt
+ if is_completion_template_defined():
+ prompt = generate_completion_prompt_from_request(request)
+ prompts.append(prompt)
+
  lora_paths.append(request.lora_path)
  if request.echo and request.logprobs:
  current_logprob_start_len = 0
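
Note: `is_completion_template_defined` and `generate_completion_prompt_from_request` come from the new `sglang/srt/code_completion_parser.py` module listed above (+174 lines); the module name suggests templates for code completion prompts (for example fill-in-the-middle), but its internals are not part of this diff. A minimal sketch of the calling pattern, with `request` as a simplified stand-in for the parsed completion request:

    # Sketch only: the two helpers are the ones imported above; everything
    # else (the request object, the surrounding loop) is simplified.
    from sglang.srt.code_completion_parser import (
        generate_completion_prompt_from_request,
        is_completion_template_defined,
    )

    def resolve_completion_prompt(request):
        prompt = request.prompt  # default: pass the raw prompt through
        if is_completion_template_defined():
            # A server-side completion template was registered; rebuild the
            # prompt from the request fields instead of using it verbatim.
            prompt = generate_completion_prompt_from_request(request)
        return prompt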
@@ -569,7 +577,9 @@ def v1_generate_request(
  return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


- def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+ def v1_generate_response(
+ request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+ ):
  choices = []
  echo = False

@@ -667,7 +677,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  # remain the same but if needed we can change that
  "id": ret[i]["meta_info"]["id"],
  "object": "text_completion",
- "created": int(time.time()),
+ "created": created,
  "model": request[i].model,
  "choices": choice,
  "usage": {
@@ -686,14 +696,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
  )
  completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+ cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
  response = CompletionResponse(
  id=ret[0]["meta_info"]["id"],
  model=request.model,
+ created=created,
  choices=choices,
  usage=UsageInfo(
  prompt_tokens=prompt_tokens,
  completion_tokens=completion_tokens,
  total_tokens=prompt_tokens + completion_tokens,
+ prompt_tokens_details=(
+ {"cached_tokens": cached_tokens} if cache_report else None
+ ),
  ),
  )
  return response
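
Note: with these changes, the usage block of a non-streaming /v1/completions response can carry `prompt_tokens_details.cached_tokens` when cache reporting (`enable_cache_report` in the server args) is turned on. A hedged client-side sketch; the URL, port, and model name are placeholders:

    # Reads the new cached-token counter from a completion response.
    # Assumes a local sglang server with cache reporting enabled.
    import requests

    resp = requests.post(
        "http://localhost:30000/v1/completions",
        json={"model": "default", "prompt": "Hello, my name is", "max_tokens": 16},
    ).json()

    usage = resp["usage"]
    # prompt_tokens_details stays null unless the server reports cache hits.
    details = usage.get("prompt_tokens_details")
    if details:
        print("cached prompt tokens:", details["cached_tokens"])
    print("prompt tokens:", usage["prompt_tokens"])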
@@ -702,6 +717,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  async def v1_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
  all_requests = [CompletionRequest(**request_json)]
+ created = int(time.time())
  adapted_request, request = v1_generate_request(all_requests)

  if adapted_request.stream:
@@ -711,6 +727,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  n_prev_tokens = {}
  prompt_tokens = {}
  completion_tokens = {}
+ cached_tokens = {}
+
  try:
  async for content in tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -723,6 +741,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  text = content["text"]
  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
+ cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)

  if not stream_buffer: # The first chunk
  if request.echo:
@@ -795,6 +814,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = CompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  object="text_completion",
  choices=[choice_data],
  model=request.model,
@@ -813,14 +833,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  total_completion_tokens = sum(
  tokens for tokens in completion_tokens.values()
  )
+ cache_report = tokenizer_manager.server_args.enable_cache_report
+ if cache_report:
+ cached_tokens_sum = sum(
+ tokens for tokens in cached_tokens.values()
+ )
+ prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+ else:
+ prompt_tokens_details = None
  usage = UsageInfo(
  prompt_tokens=total_prompt_tokens,
  completion_tokens=total_completion_tokens,
  total_tokens=total_prompt_tokens + total_completion_tokens,
+ prompt_tokens_details=prompt_tokens_details,
  )

  final_usage_chunk = CompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[],
  model=request.model,
  usage=usage,
@@ -851,7 +881,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  if not isinstance(ret, list):
  ret = [ret]

- response = v1_generate_response(request, ret, tokenizer_manager)
+ response = v1_generate_response(
+ request,
+ ret,
+ tokenizer_manager,
+ created,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
+ )
  return response


@@ -863,6 +899,7 @@ def v1_chat_generate_request(
  input_ids = []
  sampling_params_list = []
  image_data_list = []
+ audio_data_list = []
  return_logprobs = []
  logprob_start_lens = []
  top_logprobs_nums = []
@@ -876,7 +913,9 @@
  # - prompt: The full prompt string.
  # - stop: Custom stop tokens.
  # - image_data: None or a list of image strings (URLs or base64 strings).
+ # - audio_data: None or a list of audio strings (URLs).
  # None skips any image processing in GenerateReqInput.
+ strict_tag = None
  if not isinstance(request.messages, str):
  # Apply chat template and its stop strings.
  tools = None
@@ -891,6 +930,10 @@
  else:
  tools = [item.function.model_dump() for item in request.tools]

+ tool_call_parser = tokenizer_manager.server_args.tool_call_parser
+ parser = FunctionCallParser(request.tools, tool_call_parser)
+ strict_tag = parser.get_structure_tag()
+
  if chat_template_name is None:
  openai_compatible_messages = []
  for message in request.messages:
@@ -920,7 +963,7 @@
  )
  except:
  # This except branch will be triggered when the chosen model
- # has a different tools input format that is not compatiable
+ # has a different tools input format that is not compatible
  # with openAI's apply_chat_template tool_call format, like Mistral.
  tools = [t if "function" in t else {"function": t} for t in tools]
  prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
@@ -940,11 +983,13 @@
  prompt_ids += encoded
  stop = request.stop
  image_data = None
+ audio_data = None
  modalities = []
  else:
  conv = generate_chat_conv(request, chat_template_name)
  prompt = conv.get_prompt()
  image_data = conv.image_data
+ audio_data = conv.audio_data
  modalities = conv.modalities
  stop = conv.stop_str or []
  if request.stop:
@@ -958,6 +1003,7 @@
  prompt_ids = request.messages
  stop = request.stop
  image_data = None
+ audio_data = None
  modalities = []
  input_ids.append(prompt_ids)
  return_logprobs.append(request.logprobs)
@@ -995,9 +1041,26 @@
  sampling_params["structural_tag"] = convert_json_schema_to_str(
  request.response_format.model_dump(by_alias=True)
  )
+
+ if strict_tag is not None:
+ if (
+ sampling_params.get("regex")
+ or sampling_params.get("ebnf")
+ or sampling_params.get("structural_tag")
+ or sampling_params.get("json_schema")
+ ):
+ logger.warning(
+ "Constrained decoding is not compatible with tool calls."
+ )
+ else:
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
+ strict_tag.model_dump(by_alias=True)
+ )
+
  sampling_params_list.append(sampling_params)

  image_data_list.append(image_data)
+ audio_data_list.append(audio_data)
  modalities_list.append(modalities)
  if len(all_requests) == 1:
  if isinstance(input_ids[0], str):
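
Note: the net effect of the hunk above is a precedence rule for constrained decoding: any user-supplied constraint (regex, EBNF, JSON schema, or an explicit structural tag) wins, and the structure tag derived from the request's tools is installed only when none of those are present. A condensed restatement, not the actual function body; `strict_tag` stands for the value returned by `FunctionCallParser.get_structure_tag()`:

    import logging

    from sglang.utils import convert_json_schema_to_str

    logger = logging.getLogger(__name__)

    def maybe_install_tool_constraint(sampling_params: dict, strict_tag) -> None:
        if strict_tag is None:
            return  # the request carried no tools
        if any(
            sampling_params.get(key)
            for key in ("regex", "ebnf", "structural_tag", "json_schema")
        ):
            # A user-supplied constraint takes precedence over tool calling.
            logger.warning("Constrained decoding is not compatible with tool calls.")
        else:
            sampling_params["structural_tag"] = convert_json_schema_to_str(
                strict_tag.model_dump(by_alias=True)
            )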
@@ -1006,6 +1069,7 @@
  prompt_kwargs = {"input_ids": input_ids[0]}
  sampling_params_list = sampling_params_list[0]
  image_data_list = image_data_list[0]
+ audio_data_list = audio_data_list[0]
  return_logprobs = return_logprobs[0]
  logprob_start_lens = logprob_start_lens[0]
  top_logprobs_nums = top_logprobs_nums[0]
@@ -1020,6 +1084,7 @@
  adapted_request = GenerateReqInput(
  **prompt_kwargs,
  image_data=image_data_list,
+ audio_data=audio_data_list,
  sampling_params=sampling_params_list,
  return_logprob=return_logprobs,
  logprob_start_len=logprob_start_lens,
@@ -1037,6 +1102,7 @@
  def v1_chat_generate_response(
  request,
  ret,
+ created,
  to_file=False,
  cache_report=False,
  tool_call_parser=None,
@@ -1122,7 +1188,7 @@
  finish_reason["type"] = "tool_calls"
  finish_reason["matched"] = None
  try:
- full_normal_text, call_info_list = parser.parse_non_stream(text)
+ text, call_info_list = parser.parse_non_stream(text)
  tool_calls = [
  ToolCall(
  id=str(call_info.tool_index),
@@ -1145,9 +1211,9 @@
  "index": 0,
  "message": {
  "role": "assistant",
- "content": text if tool_calls is None else None,
+ "content": text if text else None,
  "tool_calls": tool_calls,
- "reasoning_content": reasoning_text,
+ "reasoning_content": reasoning_text if reasoning_text else None,
  },
  "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
  "finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1162,9 +1228,9 @@
  index=idx,
  message=ChatMessage(
  role="assistant",
- content=text if tool_calls is None else None,
+ content=text if text else None,
  tool_calls=tool_calls,
- reasoning_content=reasoning_text,
+ reasoning_content=reasoning_text if reasoning_text else None,
  ),
  logprobs=choice_logprobs,
  finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1188,7 +1254,7 @@
  # remain the same but if needed we can change that
  "id": ret[i]["meta_info"]["id"],
  "object": "chat.completion",
- "created": int(time.time()),
+ "created": created,
  "model": request[i].model,
  "choices": choice,
  "usage": {
@@ -1210,6 +1276,7 @@
  cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
  response = ChatCompletionResponse(
  id=ret[0]["meta_info"]["id"],
+ created=created,
  model=request.model,
  choices=choices,
  usage=UsageInfo(
@@ -1224,9 +1291,12 @@
  return response


- async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+ async def v1_chat_completions(
+ tokenizer_manager, raw_request: Request, cache_report=False
+ ):
  request_json = await raw_request.json()
  all_requests = [ChatCompletionRequest(**request_json)]
+ created = int(time.time())
  adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

  if adapted_request.stream:
@@ -1239,6 +1309,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  n_prev_tokens = {}
  prompt_tokens = {}
  completion_tokens = {}
+ cached_tokens = {}
  try:
  async for content in tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -1252,6 +1323,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
+ cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
  if request.logprobs:
  logprobs = to_openai_style_logprobs(
  output_token_logprobs=content["meta_info"][
@@ -1309,9 +1381,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  tokenizer_manager.server_args.reasoning_parser
  and request.separate_reasoning
  ):
- delta = DeltaMessage(role="assistant", reasoning_content="")
+ delta = DeltaMessage(
+ role="assistant", reasoning_content=None
+ )
  else:
- delta = DeltaMessage(role="assistant", content="")
+ delta = DeltaMessage(role="assistant", content=None)
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
  delta=delta,
@@ -1329,6 +1403,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1354,7 +1429,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if reasoning_text:
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(reasoning_content=reasoning_text),
+ delta=DeltaMessage(
+ reasoning_content=(
+ reasoning_text if reasoning_text else None
+ )
+ ),
  finish_reason=(
  None
  if finish_reason_type
@@ -1364,6 +1443,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1388,7 +1468,9 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if normal_text:
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(content=normal_text),
+ delta=DeltaMessage(
+ content=normal_text if normal_text else None
+ ),
  finish_reason=(
  None
  if finish_reason_type
@@ -1398,6 +1480,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1448,6 +1531,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1460,7 +1544,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  # No tool calls => just treat this as normal text
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(content=delta),
+ delta=DeltaMessage(content=delta if delta else None),
  finish_reason=(
  None
  if finish_reason_type and len(finish_reason_type) == 0
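
Note: after these changes, streamed chat deltas carry `content=None` (and `reasoning_content=None`) instead of empty strings, which is closer to the OpenAI wire format, so clients iterating over the stream should guard for None. A hedged sketch using the openai Python client; base URL, API key, and model name are placeholders:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
    stream = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Say hi"}],
        stream=True,
    )
    for chunk in stream:
        if not chunk.choices:
            continue  # e.g. a usage-only chunk with an empty choices list
        delta = chunk.choices[0].delta
        # reasoning_content is an sglang-specific field; it may be absent or None.
        reasoning = getattr(delta, "reasoning_content", None)
        if reasoning is not None:
            print(reasoning, end="", flush=True)
        if delta.content is not None:
            print(delta.content, end="", flush=True)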
@@ -1475,6 +1559,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1490,14 +1575,24 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  total_completion_tokens = sum(
  tokens for tokens in completion_tokens.values()
  )
+ cache_report = tokenizer_manager.server_args.enable_cache_report
+ if cache_report:
+ cached_tokens_sum = sum(
+ tokens for tokens in cached_tokens.values()
+ )
+ prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+ else:
+ prompt_tokens_details = None
  usage = UsageInfo(
  prompt_tokens=total_prompt_tokens,
  completion_tokens=total_completion_tokens,
  total_tokens=total_prompt_tokens + total_completion_tokens,
+ prompt_tokens_details=prompt_tokens_details,
  )

  final_usage_chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[],
  model=request.model,
  usage=usage,
@@ -1530,6 +1625,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  response = v1_chat_generate_response(
  request,
  ret,
+ created,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  reasoning_parser=tokenizer_manager.server_args.reasoning_parser,

sglang/srt/openai_api/protocol.py

@@ -16,7 +16,7 @@
  import time
  from typing import Dict, List, Optional, Union

- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, root_validator
  from typing_extensions import Literal


@@ -227,14 +227,25 @@ class ChatCompletionMessageContentImageURL(BaseModel):
  detail: Optional[Literal["auto", "low", "high"]] = "auto"


+ class ChatCompletionMessageContentAudioURL(BaseModel):
+ url: str
+
+
  class ChatCompletionMessageContentImagePart(BaseModel):
  type: Literal["image_url"]
  image_url: ChatCompletionMessageContentImageURL
  modalities: Optional[Literal["image", "multi-images", "video"]] = "image"


+ class ChatCompletionMessageContentAudioPart(BaseModel):
+ type: Literal["audio_url"]
+ audio_url: ChatCompletionMessageContentAudioURL
+
+
  ChatCompletionMessageContentPart = Union[
- ChatCompletionMessageContentTextPart, ChatCompletionMessageContentImagePart
+ ChatCompletionMessageContentTextPart,
+ ChatCompletionMessageContentImagePart,
+ ChatCompletionMessageContentAudioPart,
  ]
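
Note: the new `audio_url` content part mirrors the existing `image_url` part, so an OpenAI-style chat request can now attach audio. A hedged example payload; the endpoint, port, and model name are placeholders, and the target must be an audio-capable model (for example the MiniCPM-o support added elsewhere in this release):

    import requests

    payload = {
        "model": "default",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe this clip."},
                    {
                        "type": "audio_url",
                        "audio_url": {"url": "https://example.com/sample.wav"},
                    },
                ],
            }
        ],
    }
    resp = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
    print(resp.json()["choices"][0]["message"]["content"])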
@@ -276,6 +287,7 @@ class Function(BaseModel):
  description: Optional[str] = Field(default=None, examples=[None])
  name: Optional[str] = None
  parameters: Optional[object] = None
+ strict: bool = False


  class Tool(BaseModel):
@@ -323,6 +335,15 @@ class ChatCompletionRequest(BaseModel):
  default="auto", examples=["none"]
  ) # noqa

+ @root_validator(pre=True)
+ def set_tool_choice_default(cls, values):
+ if values.get("tool_choice") is None:
+ if values.get("tools") is None:
+ values["tool_choice"] = "none"
+ else:
+ values["tool_choice"] = "auto"
+ return values
+
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
  top_k: int = -1
  min_p: float = 0.0
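
Note: the pre-validator above resolves a missing `tool_choice` to "none" when no tools are supplied and to "auto" otherwise, which matches the defaulting documented for the OpenAI API. A standalone sketch of the same rule on a toy model, not the real ChatCompletionRequest:

    from typing import List, Optional, Union

    from pydantic import BaseModel, root_validator

    class ToyChatRequest(BaseModel):
        tools: Optional[List[dict]] = None
        tool_choice: Optional[Union[str, dict]] = None

        @root_validator(pre=True)
        def set_tool_choice_default(cls, values):
            # Runs before field validation, on the raw input dict.
            if values.get("tool_choice") is None:
                values["tool_choice"] = "none" if values.get("tools") is None else "auto"
            return values

    assert ToyChatRequest().tool_choice == "none"
    assert ToyChatRequest(tools=[{"type": "function"}]).tool_choice == "auto"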

sglang/srt/sampling/sampling_batch_info.py

@@ -306,7 +306,7 @@ class SamplingBatchInfo:
  ]:
  self_val = getattr(self, item, None)
  other_val = getattr(other, item, None)
- setattr(self, item, torch.concat([self_val, other_val]))
+ setattr(self, item, torch.cat([self_val, other_val]))

  self.is_all_greedy |= other.is_all_greedy
  self.need_min_p_sampling |= other.need_min_p_sampling
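
Note: in recent PyTorch releases `torch.concat` is simply an alias for `torch.cat`, so this hunk looks like a consistency cleanup rather than a behavioral change:

    import torch

    a, b = torch.ones(2), torch.zeros(3)
    # Both spellings produce the same result; the code now uses the canonical one.
    assert torch.equal(torch.cat([a, b]), torch.concat([a, b]))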

sglang/srt/sampling/sampling_params.py

@@ -77,7 +77,7 @@ class SamplingParams:
  self.custom_params = custom_params

  # Process some special cases
- if self.temperature < _SAMPLING_EPS:
+ if 0 <= self.temperature < _SAMPLING_EPS:
  # top_k = 1 means greedy sampling
  self.temperature = 1.0
  self.top_k = 1
@@ -93,9 +93,9 @@
  raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
  if not 0.0 <= self.min_p <= 1.0:
  raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
- if self.top_k < -1 or self.top_k == 0:
+ if self.top_k < 1 or self.top_k == -1:
  raise ValueError(
- f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
+ f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
  )
  if not -2.0 <= self.frequency_penalty <= 2.0:
  raise ValueError(
@@ -108,12 +108,12 @@ class SamplingParams:
  )
  if not 0.0 <= self.repetition_penalty <= 2.0:
  raise ValueError(
- "repetition_penalty must be in (0, 2], got "
+ "repetition_penalty must be in [0, 2], got "
  f"{self.repetition_penalty}."
  )
  if not 0 <= self.min_new_tokens:
  raise ValueError(
- f"min_new_tokens must be in (0, max_new_tokens], got "
+ f"min_new_tokens must be in [0, max_new_tokens], got "
  f"{self.min_new_tokens}."
  )
  if self.max_new_tokens is not None:
@@ -123,7 +123,7 @@
  )
  if not self.min_new_tokens <= self.max_new_tokens:
  raise ValueError(
- f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
+ f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
  f"{self.min_new_tokens}."
  )
  grammars = [