sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (185)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +26 -4
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +676 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +49 -8
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  34. sglang/srt/distributed/parallel_state.py +42 -8
  35. sglang/srt/entrypoints/engine.py +55 -5
  36. sglang/srt/entrypoints/http_server.py +78 -13
  37. sglang/srt/entrypoints/verl_engine.py +2 -0
  38. sglang/srt/function_call_parser.py +133 -55
  39. sglang/srt/hf_transformers_utils.py +28 -3
  40. sglang/srt/layers/activation.py +4 -2
  41. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  42. sglang/srt/layers/attention/flashattention_backend.py +434 -0
  43. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  44. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  45. sglang/srt/layers/attention/triton_backend.py +171 -38
  46. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  47. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  48. sglang/srt/layers/attention/utils.py +53 -0
  49. sglang/srt/layers/attention/vision.py +9 -28
  50. sglang/srt/layers/dp_attention.py +41 -19
  51. sglang/srt/layers/layernorm.py +24 -2
  52. sglang/srt/layers/linear.py +17 -5
  53. sglang/srt/layers/logits_processor.py +25 -7
  54. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  55. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  56. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  57. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  64. sglang/srt/layers/moe/topk.py +60 -20
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +80 -53
  67. sglang/srt/layers/quantization/awq.py +200 -0
  68. sglang/srt/layers/quantization/base_config.py +5 -0
  69. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  70. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  72. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  75. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  76. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  77. sglang/srt/layers/quantization/fp8.py +76 -34
  78. sglang/srt/layers/quantization/fp8_kernel.py +25 -8
  79. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  80. sglang/srt/layers/quantization/gptq.py +36 -19
  81. sglang/srt/layers/quantization/kv_cache.py +98 -0
  82. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  83. sglang/srt/layers/quantization/utils.py +153 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  85. sglang/srt/layers/rotary_embedding.py +78 -87
  86. sglang/srt/layers/sampler.py +1 -1
  87. sglang/srt/lora/backend/base_backend.py +4 -4
  88. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  89. sglang/srt/lora/backend/triton_backend.py +5 -8
  90. sglang/srt/lora/layers.py +87 -33
  91. sglang/srt/lora/lora.py +2 -22
  92. sglang/srt/lora/lora_manager.py +67 -30
  93. sglang/srt/lora/mem_pool.py +117 -52
  94. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  95. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  96. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  97. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  98. sglang/srt/lora/utils.py +18 -1
  99. sglang/srt/managers/cache_controller.py +2 -5
  100. sglang/srt/managers/data_parallel_controller.py +30 -8
  101. sglang/srt/managers/expert_distribution.py +81 -0
  102. sglang/srt/managers/io_struct.py +43 -5
  103. sglang/srt/managers/mm_utils.py +373 -0
  104. sglang/srt/managers/multimodal_processor.py +68 -0
  105. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  106. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  107. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  108. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  109. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  110. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  111. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  112. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  113. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  114. sglang/srt/managers/schedule_batch.py +134 -30
  115. sglang/srt/managers/scheduler.py +290 -31
  116. sglang/srt/managers/session_controller.py +1 -1
  117. sglang/srt/managers/tokenizer_manager.py +59 -24
  118. sglang/srt/managers/tp_worker.py +4 -1
  119. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  120. sglang/srt/managers/utils.py +6 -1
  121. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  122. sglang/srt/mem_cache/memory_pool.py +255 -98
  123. sglang/srt/mem_cache/paged_allocator.py +2 -2
  124. sglang/srt/mem_cache/radix_cache.py +4 -4
  125. sglang/srt/model_executor/cuda_graph_runner.py +36 -21
  126. sglang/srt/model_executor/forward_batch_info.py +68 -11
  127. sglang/srt/model_executor/model_runner.py +75 -8
  128. sglang/srt/model_loader/loader.py +171 -3
  129. sglang/srt/model_loader/weight_utils.py +51 -3
  130. sglang/srt/models/clip.py +563 -0
  131. sglang/srt/models/deepseek_janus_pro.py +31 -88
  132. sglang/srt/models/deepseek_nextn.py +22 -10
  133. sglang/srt/models/deepseek_v2.py +329 -73
  134. sglang/srt/models/deepseek_vl2.py +358 -0
  135. sglang/srt/models/gemma3_causal.py +694 -0
  136. sglang/srt/models/gemma3_mm.py +468 -0
  137. sglang/srt/models/llama.py +47 -7
  138. sglang/srt/models/llama_eagle.py +1 -0
  139. sglang/srt/models/llama_eagle3.py +196 -0
  140. sglang/srt/models/llava.py +3 -3
  141. sglang/srt/models/llavavid.py +3 -3
  142. sglang/srt/models/minicpmo.py +1995 -0
  143. sglang/srt/models/minicpmv.py +62 -137
  144. sglang/srt/models/mllama.py +4 -4
  145. sglang/srt/models/phi3_small.py +1 -1
  146. sglang/srt/models/qwen2.py +3 -0
  147. sglang/srt/models/qwen2_5_vl.py +68 -146
  148. sglang/srt/models/qwen2_classification.py +75 -0
  149. sglang/srt/models/qwen2_moe.py +9 -1
  150. sglang/srt/models/qwen2_vl.py +25 -63
  151. sglang/srt/openai_api/adapter.py +201 -104
  152. sglang/srt/openai_api/protocol.py +33 -7
  153. sglang/srt/patch_torch.py +71 -0
  154. sglang/srt/sampling/sampling_batch_info.py +1 -1
  155. sglang/srt/sampling/sampling_params.py +6 -6
  156. sglang/srt/server_args.py +114 -14
  157. sglang/srt/speculative/build_eagle_tree.py +7 -347
  158. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  159. sglang/srt/speculative/eagle_utils.py +208 -252
  160. sglang/srt/speculative/eagle_worker.py +140 -54
  161. sglang/srt/speculative/spec_info.py +6 -1
  162. sglang/srt/torch_memory_saver_adapter.py +22 -0
  163. sglang/srt/utils.py +215 -21
  164. sglang/test/__init__.py +0 -0
  165. sglang/test/attention/__init__.py +0 -0
  166. sglang/test/attention/test_flashattn_backend.py +312 -0
  167. sglang/test/runners.py +29 -2
  168. sglang/test/test_activation.py +2 -1
  169. sglang/test/test_block_fp8.py +5 -4
  170. sglang/test/test_block_fp8_ep.py +2 -1
  171. sglang/test/test_dynamic_grad_mode.py +58 -0
  172. sglang/test/test_layernorm.py +3 -2
  173. sglang/test/test_utils.py +56 -5
  174. sglang/utils.py +31 -0
  175. sglang/version.py +1 -1
  176. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
  177. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
  178. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
  179. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  180. sglang/srt/managers/image_processor.py +0 -55
  181. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  182. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  183. sglang/srt/managers/multi_modality_padding.py +0 -134
  184. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
  185. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
@@ -26,13 +26,10 @@ from fastapi import HTTPException, Request, UploadFile
  from fastapi.responses import ORJSONResponse, StreamingResponse
  from pydantic import ValidationError

- try:
- from outlines.fsm.json_schema import convert_json_schema_to_str
- except ImportError:
- # Before outlines 0.0.47, convert_json_schema_to_str is under
- # outlines.integrations.utils
- from outlines.integrations.utils import convert_json_schema_to_str
-
+ from sglang.srt.code_completion_parser import (
+ generate_completion_prompt_from_request,
+ is_completion_template_defined,
+ )
  from sglang.srt.conversation import (
  Conversation,
  SeparatorStyle,
@@ -41,7 +38,7 @@ from sglang.srt.conversation import (
  generate_embedding_convs,
  register_conv_template,
  )
- from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser
+ from sglang.srt.function_call_parser import FunctionCallParser
  from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
  from sglang.srt.openai_api.protocol import (
  BatchRequest,
@@ -75,7 +72,7 @@ from sglang.srt.openai_api.protocol import (
  UsageInfo,
  )
  from sglang.srt.reasoning_parser import ReasoningParser
- from sglang.utils import get_exception_traceback
+ from sglang.utils import convert_json_schema_to_str, get_exception_traceback

  logger = logging.getLogger(__name__)

@@ -310,6 +307,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  )

  try:
+ created = int(time.time())
  ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
  if not isinstance(ret, list):
  ret = [ret]
@@ -317,13 +315,19 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  responses = v1_chat_generate_response(
  request,
  ret,
+ created,
  to_file=True,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  )
  else:
  responses = v1_generate_response(
- request, ret, tokenizer_manager, to_file=True
+ request,
+ ret,
+ tokenizer_manager,
+ created,
+ to_file=True,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
  )

  except Exception as e:
@@ -504,7 +508,11 @@ def v1_generate_request(
  "To compute logprobs of input prompt, please use the native /generate API."
  )

- prompts.append(request.prompt)
+ prompt = request.prompt
+ if is_completion_template_defined():
+ prompt = generate_completion_prompt_from_request(request)
+ prompts.append(prompt)
+
  lora_paths.append(request.lora_path)
  if request.echo and request.logprobs:
  current_logprob_start_len = 0
@@ -569,7 +577,9 @@ def v1_generate_request(
  return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


- def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
+ def v1_generate_response(
+ request, ret, tokenizer_manager, created, to_file=False, cache_report=False
+ ):
  choices = []
  echo = False

@@ -635,7 +645,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  "index": 0,
  "text": text,
  "logprobs": logprobs,
- "finish_reason": (finish_reason["type"] if finish_reason else ""),
+ "finish_reason": finish_reason["type"] if finish_reason else None,
  "matched_stop": (
  finish_reason["matched"]
  if finish_reason and "matched" in finish_reason
@@ -647,7 +657,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  index=idx,
  text=text,
  logprobs=logprobs,
- finish_reason=(finish_reason["type"] if finish_reason else ""),
+ finish_reason=finish_reason["type"] if finish_reason else None,
  matched_stop=(
  finish_reason["matched"]
  if finish_reason and "matched" in finish_reason
@@ -667,7 +677,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  # remain the same but if needed we can change that
  "id": ret[i]["meta_info"]["id"],
  "object": "text_completion",
- "created": int(time.time()),
+ "created": created,
  "model": request[i].model,
  "choices": choice,
  "usage": {
@@ -686,14 +696,19 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  ret[i]["meta_info"]["prompt_tokens"] for i in range(0, len(ret), request.n)
  )
  completion_tokens = sum(item["meta_info"]["completion_tokens"] for item in ret)
+ cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
  response = CompletionResponse(
  id=ret[0]["meta_info"]["id"],
  model=request.model,
+ created=created,
  choices=choices,
  usage=UsageInfo(
  prompt_tokens=prompt_tokens,
  completion_tokens=completion_tokens,
  total_tokens=prompt_tokens + completion_tokens,
+ prompt_tokens_details=(
+ {"cached_tokens": cached_tokens} if cache_report else None
+ ),
  ),
  )
  return response
@@ -702,6 +717,7 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):
  async def v1_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
  all_requests = [CompletionRequest(**request_json)]
+ created = int(time.time())
  adapted_request, request = v1_generate_request(all_requests)

  if adapted_request.stream:
@@ -711,6 +727,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  n_prev_tokens = {}
  prompt_tokens = {}
  completion_tokens = {}
+ cached_tokens = {}
+
  try:
  async for content in tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -723,6 +741,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  text = content["text"]
  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
+ cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)

  if not stream_buffer: # The first chunk
  if request.echo:
@@ -786,7 +805,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  index=index,
  text=delta,
  logprobs=logprobs,
- finish_reason=(finish_reason["type"] if finish_reason else ""),
+ finish_reason=finish_reason["type"] if finish_reason else None,
  matched_stop=(
  finish_reason["matched"]
  if finish_reason and "matched" in finish_reason
@@ -795,6 +814,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = CompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  object="text_completion",
  choices=[choice_data],
  model=request.model,
@@ -813,14 +833,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  total_completion_tokens = sum(
  tokens for tokens in completion_tokens.values()
  )
+ cache_report = tokenizer_manager.server_args.enable_cache_report
+ if cache_report:
+ cached_tokens_sum = sum(
+ tokens for tokens in cached_tokens.values()
+ )
+ prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+ else:
+ prompt_tokens_details = None
  usage = UsageInfo(
  prompt_tokens=total_prompt_tokens,
  completion_tokens=total_completion_tokens,
  total_tokens=total_prompt_tokens + total_completion_tokens,
+ prompt_tokens_details=prompt_tokens_details,
  )

  final_usage_chunk = CompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[],
  model=request.model,
  usage=usage,
@@ -851,7 +881,13 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
  if not isinstance(ret, list):
  ret = [ret]

- response = v1_generate_response(request, ret, tokenizer_manager)
+ response = v1_generate_response(
+ request,
+ ret,
+ tokenizer_manager,
+ created,
+ cache_report=tokenizer_manager.server_args.enable_cache_report,
+ )
  return response


@@ -863,6 +899,7 @@ def v1_chat_generate_request(
  input_ids = []
  sampling_params_list = []
  image_data_list = []
+ audio_data_list = []
  return_logprobs = []
  logprob_start_lens = []
  top_logprobs_nums = []
@@ -876,7 +913,9 @@ def v1_chat_generate_request(
  # - prompt: The full prompt string.
  # - stop: Custom stop tokens.
  # - image_data: None or a list of image strings (URLs or base64 strings).
+ # - audio_data: None or a list of audio strings (URLs).
  # None skips any image processing in GenerateReqInput.
+ strict_tag = None
  if not isinstance(request.messages, str):
  # Apply chat template and its stop strings.
  tools = None
@@ -891,6 +930,10 @@ def v1_chat_generate_request(
  else:
  tools = [item.function.model_dump() for item in request.tools]

+ tool_call_parser = tokenizer_manager.server_args.tool_call_parser
+ parser = FunctionCallParser(request.tools, tool_call_parser)
+ strict_tag = parser.get_structure_tag()
+
  if chat_template_name is None:
  openai_compatible_messages = []
  for message in request.messages:
@@ -920,7 +963,7 @@ def v1_chat_generate_request(
  )
  except:
  # This except branch will be triggered when the chosen model
- # has a different tools input format that is not compatiable
+ # has a different tools input format that is not compatible
  # with openAI's apply_chat_template tool_call format, like Mistral.
  tools = [t if "function" in t else {"function": t} for t in tools]
  prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
@@ -940,11 +983,13 @@ def v1_chat_generate_request(
  prompt_ids += encoded
  stop = request.stop
  image_data = None
+ audio_data = None
  modalities = []
  else:
  conv = generate_chat_conv(request, chat_template_name)
  prompt = conv.get_prompt()
  image_data = conv.image_data
+ audio_data = conv.audio_data
  modalities = conv.modalities
  stop = conv.stop_str or []
  if request.stop:
@@ -958,6 +1003,7 @@ def v1_chat_generate_request(
  prompt_ids = request.messages
  stop = request.stop
  image_data = None
+ audio_data = None
  modalities = []
  input_ids.append(prompt_ids)
  return_logprobs.append(request.logprobs)
@@ -995,9 +1041,26 @@ def v1_chat_generate_request(
  sampling_params["structural_tag"] = convert_json_schema_to_str(
  request.response_format.model_dump(by_alias=True)
  )
+
+ if strict_tag is not None:
+ if (
+ sampling_params.get("regex")
+ or sampling_params.get("ebnf")
+ or sampling_params.get("structural_tag")
+ or sampling_params.get("json_schema")
+ ):
+ logger.warning(
+ "Constrained decoding is not compatible with tool calls."
+ )
+ else:
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
+ strict_tag.model_dump(by_alias=True)
+ )
+
  sampling_params_list.append(sampling_params)

  image_data_list.append(image_data)
+ audio_data_list.append(audio_data)
  modalities_list.append(modalities)
  if len(all_requests) == 1:
  if isinstance(input_ids[0], str):
@@ -1006,6 +1069,7 @@ def v1_chat_generate_request(
  prompt_kwargs = {"input_ids": input_ids[0]}
  sampling_params_list = sampling_params_list[0]
  image_data_list = image_data_list[0]
+ audio_data_list = audio_data_list[0]
  return_logprobs = return_logprobs[0]
  logprob_start_lens = logprob_start_lens[0]
  top_logprobs_nums = top_logprobs_nums[0]
@@ -1020,6 +1084,7 @@ def v1_chat_generate_request(
  adapted_request = GenerateReqInput(
  **prompt_kwargs,
  image_data=image_data_list,
+ audio_data=audio_data_list,
  sampling_params=sampling_params_list,
  return_logprob=return_logprobs,
  logprob_start_len=logprob_start_lens,
@@ -1037,6 +1102,7 @@ def v1_chat_generate_request(
  def v1_chat_generate_response(
  request,
  ret,
+ created,
  to_file=False,
  cache_report=False,
  tool_call_parser=None,
@@ -1053,7 +1119,9 @@ def v1_chat_generate_response(
  if logprobs:
  logprobs = to_openai_style_logprobs(
  output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
- output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+ output_top_logprobs=ret_item["meta_info"].get(
+ "output_top_logprobs", None
+ ),
  )
  token_logprobs = []
  for token_idx, (token, logprob) in enumerate(
@@ -1122,7 +1190,7 @@ def v1_chat_generate_response(
  finish_reason["type"] = "tool_calls"
  finish_reason["matched"] = None
  try:
- full_normal_text, call_info_list = parser.parse_non_stream(text)
+ text, call_info_list = parser.parse_non_stream(text)
  tool_calls = [
  ToolCall(
  id=str(call_info.tool_index),
@@ -1145,12 +1213,12 @@ def v1_chat_generate_response(
  "index": 0,
  "message": {
  "role": "assistant",
- "content": text if tool_calls is None else None,
+ "content": text if text else None,
  "tool_calls": tool_calls,
- "reasoning_content": reasoning_text,
+ "reasoning_content": reasoning_text if reasoning_text else None,
  },
  "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
- "finish_reason": (finish_reason["type"] if finish_reason else ""),
+ "finish_reason": finish_reason["type"] if finish_reason else None,
  "matched_stop": (
  finish_reason["matched"]
  if finish_reason and "matched" in finish_reason
@@ -1162,12 +1230,12 @@ def v1_chat_generate_response(
  index=idx,
  message=ChatMessage(
  role="assistant",
- content=text if tool_calls is None else None,
+ content=text if text else None,
  tool_calls=tool_calls,
- reasoning_content=reasoning_text,
+ reasoning_content=reasoning_text if reasoning_text else None,
  ),
  logprobs=choice_logprobs,
- finish_reason=(finish_reason["type"] if finish_reason else ""),
+ finish_reason=finish_reason["type"] if finish_reason else None,
  matched_stop=(
  finish_reason["matched"]
  if finish_reason and "matched" in finish_reason
@@ -1188,7 +1256,7 @@ def v1_chat_generate_response(
  # remain the same but if needed we can change that
  "id": ret[i]["meta_info"]["id"],
  "object": "chat.completion",
- "created": int(time.time()),
+ "created": created,
  "model": request[i].model,
  "choices": choice,
  "usage": {
@@ -1210,6 +1278,7 @@ def v1_chat_generate_response(
  cached_tokens = sum(item["meta_info"].get("cached_tokens", 0) for item in ret)
  response = ChatCompletionResponse(
  id=ret[0]["meta_info"]["id"],
+ created=created,
  model=request.model,
  choices=choices,
  usage=UsageInfo(
@@ -1224,9 +1293,12 @@ def v1_chat_generate_response(
  return response


- async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+ async def v1_chat_completions(
+ tokenizer_manager, raw_request: Request, cache_report=False
+ ):
  request_json = await raw_request.json()
  all_requests = [ChatCompletionRequest(**request_json)]
+ created = int(time.time())
  adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

  if adapted_request.stream:
@@ -1239,6 +1311,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  n_prev_tokens = {}
  prompt_tokens = {}
  completion_tokens = {}
+ cached_tokens = {}
  try:
  async for content in tokenizer_manager.generate_request(
  adapted_request, raw_request
@@ -1252,14 +1325,15 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
+ cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
  if request.logprobs:
  logprobs = to_openai_style_logprobs(
  output_token_logprobs=content["meta_info"][
  "output_token_logprobs"
  ][n_prev_token:],
- output_top_logprobs=content["meta_info"][
- "output_top_logprobs"
- ][n_prev_token:],
+ output_top_logprobs=content["meta_info"].get(
+ "output_top_logprobs", []
+ )[n_prev_token:],
  )

  n_prev_token = len(
@@ -1305,21 +1379,11 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if is_first:
  # First chunk with role
  is_first = False
- if (
- tokenizer_manager.server_args.reasoning_parser
- and request.separate_reasoning
- ):
- delta = DeltaMessage(role="assistant", reasoning_content="")
- else:
- delta = DeltaMessage(role="assistant", content="")
+ delta = DeltaMessage(role="assistant")
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
  delta=delta,
- finish_reason=(
- None
- if finish_reason_type and len(finish_reason_type) == 0
- else finish_reason_type
- ),
+ finish_reason=finish_reason_type,
  matched_stop=(
  finish_reason["matched"]
  if finish_reason and "matched" in finish_reason
@@ -1329,6 +1393,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1354,16 +1419,16 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if reasoning_text:
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(reasoning_content=reasoning_text),
- finish_reason=(
- None
- if finish_reason_type
- and len(finish_reason_type) == 0
- else finish_reason_type
+ delta=DeltaMessage(
+ reasoning_content=(
+ reasoning_text if reasoning_text else None
+ )
  ),
+ finish_reason=finish_reason_type,
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1388,16 +1453,14 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  if normal_text:
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(content=normal_text),
- finish_reason=(
- None
- if finish_reason_type
- and len(finish_reason_type) == 0
- else finish_reason_type
+ delta=DeltaMessage(
+ content=normal_text if normal_text else None
  ),
+ finish_reason=finish_reason_type,
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1407,11 +1470,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  for call_item in calls:
  # transform call_item -> FunctionResponse + ToolCall

- if (
- content["meta_info"]["finish_reason"]
- and content["meta_info"]["finish_reason"]["type"]
- == "stop"
- ):
+ if finish_reason_type == "stop":
  latest_delta_len = 0
  if isinstance(call_item.parameters, str):
  latest_delta_len = len(call_item.parameters)
@@ -1432,6 +1491,8 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  call_item.parameters = remaining_call

+ finish_reason_type = "tool_calls"
+
  tool_call = ToolCall(
  id=str(call_item.tool_index),
  function=FunctionResponse(
@@ -1441,13 +1502,17 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  )
  choice_data = ChatCompletionResponseStreamChoice(
  index=index,
- delta=DeltaMessage(
- role="assistant", tool_calls=[tool_call]
- ),
- finish_reason="tool_call",
+ delta=DeltaMessage(tool_calls=[tool_call]),
+ finish_reason=(
+ None
+ if request.stream_options
+ and request.stream_options.include_usage
+ else finish_reason_type
+ ), # additional chunk will be return
  )
  chunk = ChatCompletionStreamResponse(
  id=content["meta_info"]["id"],
+ created=created,
  choices=[choice_data],
  model=request.model,
  )
@@ -1458,29 +1523,44 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

  else:
  # No tool calls => just treat this as normal text
- choice_data = ChatCompletionResponseStreamChoice(
- index=index,
- delta=DeltaMessage(content=delta),
- finish_reason=(
- None
- if finish_reason_type and len(finish_reason_type) == 0
- else finish_reason_type
- ),
- matched_stop=(
- finish_reason["matched"]
- if finish_reason and "matched" in finish_reason
- else None
- ),
- logprobs=choice_logprobs,
- )
- chunk = ChatCompletionStreamResponse(
- id=content["meta_info"]["id"],
- choices=[choice_data],
- model=request.model,
- )
- yield f"data: {chunk.model_dump_json()}\n\n"
- stream_buffers[index] = new_stream_buffer
- is_firsts[index] = is_first
+ if delta or not (
+ request.stream_options
+ and request.stream_options.include_usage
+ ):
+ choice_data = ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(content=delta if delta else None),
+ finish_reason=(
+ None
+ if request.stream_options
+ and request.stream_options.include_usage
+ else finish_reason_type
+ ),
+ matched_stop=(
+ finish_reason["matched"]
+ if finish_reason and "matched" in finish_reason
+ else None
+ ),
+ logprobs=choice_logprobs,
+ )
+ chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ created=created,
+ choices=[choice_data],
+ model=request.model,
+ )
+ yield f"data: {chunk.model_dump_json()}\n\n"
+ stream_buffers[index] = new_stream_buffer
+ is_firsts[index] = is_first
+ if finish_reason_type == "stop" and request.tool_choice != "none":
+ parser = FunctionCallParser(
+ tools=request.tools,
+ tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+ )
+ if parser.has_tool_call(new_stream_buffer):
+ # if the stream ends with empty string after tool calls
+ finish_reason_type = "tool_calls"
+
  if request.stream_options and request.stream_options.include_usage:
  total_prompt_tokens = sum(
  tokens
@@ -1490,22 +1570,37 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  total_completion_tokens = sum(
  tokens for tokens in completion_tokens.values()
  )
+ cache_report = tokenizer_manager.server_args.enable_cache_report
+ if cache_report:
+ cached_tokens_sum = sum(
+ tokens for tokens in cached_tokens.values()
+ )
+ prompt_tokens_details = {"cached_tokens": cached_tokens_sum}
+ else:
+ prompt_tokens_details = None
  usage = UsageInfo(
  prompt_tokens=total_prompt_tokens,
  completion_tokens=total_completion_tokens,
  total_tokens=total_prompt_tokens + total_completion_tokens,
+ prompt_tokens_details=prompt_tokens_details,
  )

- final_usage_chunk = ChatCompletionStreamResponse(
- id=content["meta_info"]["id"],
- choices=[],
- model=request.model,
- usage=usage,
- )
- final_usage_data = final_usage_chunk.model_dump_json(
- exclude_none=True
- )
- yield f"data: {final_usage_data}\n\n"
+ else:
+ usage = None
+ final_usage_chunk = ChatCompletionStreamResponse(
+ id=content["meta_info"]["id"],
+ created=created,
+ choices=[
+ ChatCompletionResponseStreamChoice(
+ index=index,
+ delta=DeltaMessage(),
+ finish_reason=finish_reason_type,
+ )
+ ],
+ model=request.model,
+ usage=usage,
+ )
+ yield f"data: {final_usage_chunk.model_dump_json()}\n\n"
  except ValueError as e:
  error = create_streaming_error_response(str(e))
  yield f"data: {error}\n\n"
@@ -1530,6 +1625,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  response = v1_chat_generate_response(
  request,
  ret,
+ created,
  cache_report=tokenizer_manager.server_args.enable_cache_report,
  tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
  reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
@@ -1557,18 +1653,19 @@ def v1_embedding_request(all_requests, tokenizer_manager):
  elif isinstance(prompt, list) and isinstance(
  prompt[0], MultimodalEmbeddingInput
  ):
- assert (
- chat_template_name is not None
- ), "chat_template_name is required for multimodal inputs"
  texts = []
  images = []
  for item in prompt:
- texts.append(item.text if item.text is not None else None)
+ # TODO simply use padding for text, we should use a better way to handle this
+ texts.append(item.text if item.text is not None else "padding")
  images.append(item.image if item.image is not None else None)
- convs = generate_embedding_convs(texts, images, chat_template_name)
  generate_prompts = []
- for conv in convs:
- generate_prompts.append(conv.get_prompt())
+ if chat_template_name is not None:
+ convs = generate_embedding_convs(texts, images, chat_template_name)
+ for conv in convs:
+ generate_prompts.append(conv.get_prompt())
+ else:
+ generate_prompts = texts
  if len(generate_prompts) == 1:
  prompt_kwargs = {"text": generate_prompts[0], "image_data": images[0]}
  else: