sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sglang/bench_serving.py +72 -10
  2. sglang/srt/_custom_ops.py +59 -92
  3. sglang/srt/configs/deepseekvl2.py +10 -1
  4. sglang/srt/configs/model_config.py +6 -16
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/custom_op.py +5 -0
  7. sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
  8. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  9. sglang/srt/distributed/parallel_state.py +32 -5
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/entrypoints/http_server.py +7 -1
  12. sglang/srt/entrypoints/verl_engine.py +2 -0
  13. sglang/srt/function_call_parser.py +0 -1
  14. sglang/srt/layers/attention/flashattention_backend.py +582 -125
  15. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  17. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  18. sglang/srt/layers/dp_attention.py +12 -1
  19. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  20. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  21. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
  26. sglang/srt/layers/moe/topk.py +79 -6
  27. sglang/srt/layers/quantization/__init__.py +137 -165
  28. sglang/srt/layers/quantization/awq.py +200 -0
  29. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  30. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  31. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  32. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  33. sglang/srt/layers/quantization/gptq.py +30 -40
  34. sglang/srt/layers/quantization/moe_wna16.py +501 -0
  35. sglang/srt/layers/quantization/utils.py +1 -1
  36. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  37. sglang/srt/lora/backend/base_backend.py +4 -4
  38. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  39. sglang/srt/lora/backend/triton_backend.py +5 -8
  40. sglang/srt/lora/layers.py +19 -33
  41. sglang/srt/lora/lora_manager.py +20 -7
  42. sglang/srt/lora/mem_pool.py +12 -6
  43. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  44. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  45. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  46. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  47. sglang/srt/lora/utils.py +6 -0
  48. sglang/srt/managers/cache_controller.py +34 -11
  49. sglang/srt/managers/io_struct.py +4 -2
  50. sglang/srt/managers/mm_utils.py +202 -156
  51. sglang/srt/managers/multimodal_processor.py +0 -2
  52. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  53. sglang/srt/managers/multimodal_processors/clip.py +44 -0
  54. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  55. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  56. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  57. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  58. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  59. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  61. sglang/srt/managers/schedule_batch.py +185 -127
  62. sglang/srt/managers/scheduler.py +29 -23
  63. sglang/srt/managers/tokenizer_manager.py +1 -2
  64. sglang/srt/managers/tp_worker.py +3 -0
  65. sglang/srt/managers/utils.py +1 -6
  66. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  67. sglang/srt/mem_cache/memory_pool.py +72 -6
  68. sglang/srt/mem_cache/paged_allocator.py +39 -0
  69. sglang/srt/metrics/collector.py +23 -53
  70. sglang/srt/model_executor/cuda_graph_runner.py +16 -13
  71. sglang/srt/model_executor/forward_batch_info.py +10 -10
  72. sglang/srt/model_executor/model_runner.py +64 -59
  73. sglang/srt/model_loader/loader.py +19 -1
  74. sglang/srt/model_loader/weight_utils.py +6 -3
  75. sglang/srt/models/clip.py +568 -0
  76. sglang/srt/models/deepseek_janus_pro.py +12 -17
  77. sglang/srt/models/deepseek_v2.py +339 -123
  78. sglang/srt/models/deepseek_vl2.py +105 -104
  79. sglang/srt/models/gemma3_causal.py +12 -2
  80. sglang/srt/models/gemma3_mm.py +20 -80
  81. sglang/srt/models/llama.py +4 -1
  82. sglang/srt/models/llava.py +31 -19
  83. sglang/srt/models/llavavid.py +16 -7
  84. sglang/srt/models/minicpmo.py +63 -147
  85. sglang/srt/models/minicpmv.py +17 -27
  86. sglang/srt/models/mllama.py +29 -14
  87. sglang/srt/models/qwen2.py +9 -6
  88. sglang/srt/models/qwen2_5_vl.py +21 -31
  89. sglang/srt/models/qwen2_vl.py +20 -21
  90. sglang/srt/openai_api/adapter.py +106 -93
  91. sglang/srt/openai_api/protocol.py +10 -5
  92. sglang/srt/patch_torch.py +71 -0
  93. sglang/srt/platforms/interface.py +371 -0
  94. sglang/srt/server_args.py +120 -25
  95. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  96. sglang/srt/speculative/eagle_utils.py +140 -28
  97. sglang/srt/speculative/eagle_worker.py +94 -25
  98. sglang/srt/utils.py +137 -51
  99. sglang/test/runners.py +27 -2
  100. sglang/test/test_custom_ops.py +55 -0
  101. sglang/test/test_utils.py +14 -27
  102. sglang/utils.py +2 -2
  103. sglang/version.py +1 -1
  104. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
  105. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
  106. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
  107. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
  108. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_vl.py

@@ -45,7 +45,7 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternTokenPairs,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2Model
@@ -472,18 +472,24 @@ class Qwen2VLForConditionalGeneration(nn.Module):

     # Use grid_t * grid_w * grid_h to pad tokens for each image
     # add replaced padding by unique image hash
-    def pad_input_ids(self, input_ids: List[int], multi_modal_inputs: MultimodalInputs):
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
         # Get all special token IDs
-        im_start_id: int = multi_modal_inputs.im_start_id
-        im_end_id: int = multi_modal_inputs.im_end_id
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id

         media_token_pairs = [(im_start_id, im_end_id)]
         pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
-        return pattern.pad_input_tokens(input_ids, multi_modal_inputs)
+        return pattern.pad_input_tokens(input_ids, mm_inputs)

-    def get_image_feature(self, image_input: MultimodalInputs) -> torch.Tensor:
-        pixel_values = image_input.pixel_values.type(self.visual.dtype)
-        image_embeds = self.visual(pixel_values, grid_thw=image_input.image_grid_thws)
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thws = torch.concat([item.image_grid_thws for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thws.dim() == 2, image_grid_thws.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thws)
         return image_embeds

     def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
@@ -527,27 +533,20 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 "multimodal section rotary embedding requires "
                 f"(3, seq_len) positions, but got {positions.size()}"
             )
-
-        inputs_embeds = general_mm_embed_routine(
+        hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
-            embed_tokens=self.get_input_embeddings(),
-            mm_data_embedding_func=self.get_image_feature,
-        )
-
-        hidden_states = self.model(
-            input_ids=None,
+            language_model=self.model,
+            image_data_embedding_func=self.get_image_feature,
             positions=positions,
-            forward_batch=forward_batch,
-            input_embeds=inputs_embeds,
         )

-        if not get_embedding:
+        if get_embedding:
+            return self.pooler(hidden_states, forward_batch)
+        else:
             return self.logits_processor(
                 input_ids, hidden_states, self.lm_head, forward_batch
             )
-        else:
-            return self.pooler(hidden_states, forward_batch)

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
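Note: the Qwen2-VL hunks above switch the multimodal interface from a single MultimodalInputs object to a list of MultimodalDataItem entries; get_image_feature now concatenates the per-item pixel_values and image_grid_thws along dim 0 before calling the vision tower, and the embedding plus language-model forward is folded into one general_mm_embed_routine call. Below is a minimal sketch of just the concatenation pattern, using a hypothetical stand-in dataclass and made-up tensor shapes rather than the real sglang types.

from dataclasses import dataclass
from typing import List

import torch


@dataclass
class FakeMultimodalDataItem:
    # Stand-in for sglang.srt.managers.schedule_batch.MultimodalDataItem;
    # only the two fields used by get_image_feature are modeled here.
    pixel_values: torch.Tensor      # (num_patches_i, feature_dim)
    image_grid_thws: torch.Tensor   # (num_images_i, 3)


def batch_image_inputs(items: List[FakeMultimodalDataItem]):
    # Same pattern as the new get_image_feature: concatenate every item
    # along dim 0 so the vision tower sees one flat batch.
    pixel_values = torch.cat([item.pixel_values for item in items], dim=0)
    image_grid_thws = torch.cat([item.image_grid_thws for item in items], dim=0)
    assert pixel_values.dim() == 2 and image_grid_thws.dim() == 2
    return pixel_values, image_grid_thws


items = [
    FakeMultimodalDataItem(torch.randn(16, 1176), torch.tensor([[1, 4, 4]])),
    FakeMultimodalDataItem(torch.randn(36, 1176), torch.tensor([[1, 6, 6]])),
]
pv, grid = batch_image_inputs(items)
print(pv.shape, grid.shape)  # torch.Size([52, 1176]) torch.Size([2, 3])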
sglang/srt/openai_api/adapter.py

@@ -20,7 +20,7 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Any, Dict, List, Set
+from typing import Dict, List

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -645,7 +645,7 @@ def v1_generate_response(
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -657,7 +657,7 @@
                 index=idx,
                 text=text,
                 logprobs=logprobs,
-                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -805,7 +805,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                     index=index,
                     text=delta,
                     logprobs=logprobs,
-                    finish_reason=(finish_reason["type"] if finish_reason else ""),
+                    finish_reason=finish_reason["type"] if finish_reason else None,
                     matched_stop=(
                         finish_reason["matched"]
                         if finish_reason and "matched" in finish_reason
@@ -897,6 +897,7 @@ def v1_chat_generate_request(
     request_ids: List[str] = None,
 ):
     input_ids = []
+    prompts = []
     sampling_params_list = []
     image_data_list = []
     audio_data_list = []
@@ -916,6 +917,7 @@
         # - audio_data: None or a list of audio strings (URLs).
         # None skips any image processing in GenerateReqInput.
         strict_tag = None
+        prompt = ""
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -1005,11 +1007,13 @@
             image_data = None
             audio_data = None
             modalities = []
+            prompt = request.messages
         input_ids.append(prompt_ids)
         return_logprobs.append(request.logprobs)
         logprob_start_lens.append(-1)
         top_logprobs_nums.append(request.top_logprobs or 0)
         lora_paths.append(request.lora_path)
+        prompts.append(prompt)

         sampling_params = {
             "temperature": request.temperature,
@@ -1063,10 +1067,14 @@
         audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
-        if isinstance(input_ids[0], str):
-            prompt_kwargs = {"text": input_ids[0]}
+        if tokenizer_manager.model_config.is_multimodal:
+            # processor will need text input
+            prompt_kwargs = {"text": prompts[0]}
         else:
-            prompt_kwargs = {"input_ids": input_ids[0]}
+            if isinstance(input_ids[0], str):
+                prompt_kwargs = {"text": input_ids[0]}
+            else:
+                prompt_kwargs = {"input_ids": input_ids[0]}
         sampling_params_list = sampling_params_list[0]
         image_data_list = image_data_list[0]
         audio_data_list = audio_data_list[0]
@@ -1076,10 +1084,14 @@
         modalities_list = modalities_list[0]
         lora_paths = lora_paths[0]
     else:
-        if isinstance(input_ids[0], str):
-            prompt_kwargs = {"text": input_ids}
+        if tokenizer_manager.model_config.is_multimodal:
+            # processor will need text input
+            prompt_kwargs = {"text": prompts}
         else:
-            prompt_kwargs = {"input_ids": input_ids}
+            if isinstance(input_ids[0], str):
+                prompt_kwargs = {"text": input_ids}
+            else:
+                prompt_kwargs = {"input_ids": input_ids}

     adapted_request = GenerateReqInput(
         **prompt_kwargs,
@@ -1119,7 +1131,9 @@ def v1_chat_generate_response(
         if logprobs:
             logprobs = to_openai_style_logprobs(
                 output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
-                output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+                output_top_logprobs=ret_item["meta_info"].get(
+                    "output_top_logprobs", None
+                ),
             )
             token_logprobs = []
             for token_idx, (token, logprob) in enumerate(
@@ -1216,7 +1230,7 @@ def v1_chat_generate_response(
                     "reasoning_content": reasoning_text if reasoning_text else None,
                 },
                 "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
-                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1233,7 +1247,7 @@
                     reasoning_content=reasoning_text if reasoning_text else None,
                 ),
                 logprobs=choice_logprobs,
-                finish_reason=(finish_reason["type"] if finish_reason else ""),
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1329,9 +1343,9 @@ async def v1_chat_completions(
                            output_token_logprobs=content["meta_info"][
                                "output_token_logprobs"
                            ][n_prev_token:],
-                            output_top_logprobs=content["meta_info"][
-                                "output_top_logprobs"
-                            ][n_prev_token:],
+                            output_top_logprobs=content["meta_info"].get(
+                                "output_top_logprobs", []
+                            )[n_prev_token:],
                        )

                        n_prev_token = len(
@@ -1377,23 +1391,11 @@
                    if is_first:
                        # First chunk with role
                        is_first = False
-                        if (
-                            tokenizer_manager.server_args.reasoning_parser
-                            and request.separate_reasoning
-                        ):
-                            delta = DeltaMessage(
-                                role="assistant", reasoning_content=None
-                            )
-                        else:
-                            delta = DeltaMessage(role="assistant", content=None)
+                        delta = DeltaMessage(role="assistant")
                        choice_data = ChatCompletionResponseStreamChoice(
                            index=index,
                            delta=delta,
-                            finish_reason=(
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                            matched_stop=(
                                finish_reason["matched"]
                                if finish_reason and "matched" in finish_reason
@@ -1434,12 +1436,7 @@
                                    reasoning_text if reasoning_text else None
                                )
                            ),
-                            finish_reason=(
-                                None
-                                if finish_reason_type
-                                and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                        )
                        chunk = ChatCompletionStreamResponse(
                            id=content["meta_info"]["id"],
@@ -1471,12 +1468,7 @@
                            delta=DeltaMessage(
                                content=normal_text if normal_text else None
                            ),
-                            finish_reason=(
-                                None
-                                if finish_reason_type
-                                and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                        )
                        chunk = ChatCompletionStreamResponse(
                            id=content["meta_info"]["id"],
@@ -1490,11 +1482,7 @@
                        for call_item in calls:
                            # transform call_item -> FunctionResponse + ToolCall

-                            if (
-                                content["meta_info"]["finish_reason"]
-                                and content["meta_info"]["finish_reason"]["type"]
-                                == "stop"
-                            ):
+                            if finish_reason_type == "stop":
                                latest_delta_len = 0
                                if isinstance(call_item.parameters, str):
                                    latest_delta_len = len(call_item.parameters)
@@ -1515,6 +1503,8 @@
                                )
                                call_item.parameters = remaining_call

+                            finish_reason_type = "tool_calls"
+
                            tool_call = ToolCall(
                                id=str(call_item.tool_index),
                                function=FunctionResponse(
@@ -1524,10 +1514,13 @@
                            )
                            choice_data = ChatCompletionResponseStreamChoice(
                                index=index,
-                                delta=DeltaMessage(
-                                    role="assistant", tool_calls=[tool_call]
-                                ),
-                                finish_reason="tool_call",
+                                delta=DeltaMessage(tool_calls=[tool_call]),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ),  # additional chunk will be return
                            )
                            chunk = ChatCompletionStreamResponse(
                                id=content["meta_info"]["id"],
@@ -1542,30 +1535,44 @@

                    else:
                        # No tool calls => just treat this as normal text
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=(
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
-                            matched_stop=(
-                                finish_reason["matched"]
-                                if finish_reason and "matched" in finish_reason
-                                else None
-                            ),
-                            logprobs=choice_logprobs,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=created,
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-                        stream_buffers[index] = new_stream_buffer
-                        is_firsts[index] = is_first
+                        if delta or not (
+                            request.stream_options
+                            and request.stream_options.include_usage
+                        ):
+                            choice_data = ChatCompletionResponseStreamChoice(
+                                index=index,
+                                delta=DeltaMessage(content=delta if delta else None),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ),
+                                matched_stop=(
+                                    finish_reason["matched"]
+                                    if finish_reason and "matched" in finish_reason
+                                    else None
+                                ),
+                                logprobs=choice_logprobs,
+                            )
+                            chunk = ChatCompletionStreamResponse(
+                                id=content["meta_info"]["id"],
+                                created=created,
+                                choices=[choice_data],
+                                model=request.model,
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
+                        stream_buffers[index] = new_stream_buffer
+                        is_firsts[index] = is_first
+                    if finish_reason_type == "stop" and request.tool_choice != "none":
+                        parser = FunctionCallParser(
+                            tools=request.tools,
+                            tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+                        )
+                        if parser.has_tool_call(new_stream_buffer):
+                            # if the stream ends with empty string after tool calls
+                            finish_reason_type = "tool_calls"
+
                if request.stream_options and request.stream_options.include_usage:
                    total_prompt_tokens = sum(
                        tokens
@@ -1590,17 +1597,22 @@
                        prompt_tokens_details=prompt_tokens_details,
                    )

-                final_usage_chunk = ChatCompletionStreamResponse(
-                    id=content["meta_info"]["id"],
-                    created=created,
-                    choices=[],
-                    model=request.model,
-                    usage=usage,
-                )
-                final_usage_data = final_usage_chunk.model_dump_json(
-                    exclude_none=True
-                )
-                yield f"data: {final_usage_data}\n\n"
+                else:
+                    usage = None
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(),
+                            finish_reason=finish_reason_type,
+                        )
+                    ],
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {final_usage_chunk.model_dump_json()}\n\n"
            except ValueError as e:
                error = create_streaming_error_response(str(e))
                yield f"data: {error}\n\n"
@@ -1653,18 +1665,19 @@ def v1_embedding_request(all_requests, tokenizer_manager):
        elif isinstance(prompt, list) and isinstance(
            prompt[0], MultimodalEmbeddingInput
        ):
-            assert (
-                chat_template_name is not None
-            ), "chat_template_name is required for multimodal inputs"
            texts = []
            images = []
            for item in prompt:
-                texts.append(item.text if item.text is not None else None)
+                # TODO simply use padding for text, we should use a better way to handle this
+                texts.append(item.text if item.text is not None else "padding")
                images.append(item.image if item.image is not None else None)
-            convs = generate_embedding_convs(texts, images, chat_template_name)
            generate_prompts = []
-            for conv in convs:
-                generate_prompts.append(conv.get_prompt())
+            if chat_template_name is not None:
+                convs = generate_embedding_convs(texts, images, chat_template_name)
+                for conv in convs:
+                    generate_prompts.append(conv.get_prompt())
+            else:
+                generate_prompts = texts
            if len(generate_prompts) == 1:
                prompt_kwargs = {"text": generate_prompts[0], "image_data": images[0]}
            else:
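Note: the adapter hunks above change the OpenAI-compatible behavior in three user-visible ways: finish_reason is now null until generation actually ends, streaming tool calls finish with "tool_calls" instead of the non-standard "tool_call", and when stream_options.include_usage is requested the finish_reason and usage arrive in a dedicated final chunk. Below is a hedged client-side sketch against a locally running sglang server; the base_url, port, API key, and model name are placeholders, not values taken from this diff.

from openai import OpenAI

# Assumes a locally running sglang OpenAI-compatible server; adjust to your setup.
client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if chunk.choices:
        choice = chunk.choices[0]
        # finish_reason stays None on intermediate chunks and is set on the final one.
        print(choice.delta.content or "", end="", flush=True)
        if choice.finish_reason is not None:
            print(f"\n[finish_reason={choice.finish_reason}]")
    if chunk.usage:
        # With include_usage, the last chunk also carries token counts.
        print(f"[usage: {chunk.usage.total_tokens} total tokens]")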
sglang/srt/openai_api/protocol.py

@@ -28,6 +28,7 @@ class ModelCard(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "sglang"
     root: Optional[str] = None
+    max_model_len: Optional[int] = None


 class ModelList(BaseModel):
@@ -187,7 +188,7 @@ class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Literal["stop", "length", "content_filter"]
     matched_stop: Union[None, int, str] = None


@@ -204,7 +205,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
     matched_stop: Union[None, int, str] = None


@@ -322,7 +323,7 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = None
     n: int = 1
     presence_penalty: float = 0.0
-    response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
@@ -387,7 +388,9 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: str
+    finish_reason: Literal[
+        "stop", "length", "tool_calls", "content_filter", "function_call"
+    ]
     matched_stop: Union[None, int, str] = None


@@ -411,7 +414,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[
+        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+    ] = None
     matched_stop: Union[None, int, str] = None

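Note: the protocol hunks above narrow finish_reason from a free-form Optional[str] to the Literal values the OpenAI API defines, so an out-of-vocabulary value is rejected when the response model is built instead of leaking to clients. Below is a small standalone Pydantic sketch of that effect; StreamChoiceSketch is a stand-in, not the real sglang class.

from typing import Literal, Optional

from pydantic import BaseModel, ValidationError


class StreamChoiceSketch(BaseModel):
    # Mirrors the narrowed field in ChatCompletionResponseStreamChoice.
    finish_reason: Optional[
        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
    ] = None


print(StreamChoiceSketch(finish_reason="tool_calls"))  # accepted
print(StreamChoiceSketch())                            # None while still streaming

try:
    StreamChoiceSketch(finish_reason="tool_call")      # the old, non-standard value
except ValidationError as e:
    print("rejected:", e.errors()[0]["type"])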
sglang/srt/patch_torch.py (new file)

@@ -0,0 +1,71 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from typing import Callable, Union
+
+import torch
+from torch.multiprocessing import reductions
+
+
+def monkey_patch_torch_reductions():
+    """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+
+    if hasattr(reductions, "_reduce_tensor_original"):
+        return
+
+    reductions._reduce_tensor_original = reductions.reduce_tensor
+    reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor
+
+    reductions.reduce_tensor = _reduce_tensor_modified
+    reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified
+
+    reductions.init_reductions()
+
+
+# The signature has not been changed for years, and we will not need this when the next version is released,
+# so it looks safe to use a constant.
+_REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
+
+
+def _reduce_tensor_modified(*args, **kwargs):
+    output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)
+    output_args = _modify_tuple(
+        output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
+    )
+    return output_fn, output_args
+
+
+def _rebuild_cuda_tensor_modified(*args):
+    args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid)
+    return reductions._rebuild_cuda_tensor_original(*args)
+
+
+def _device_to_uuid(device: int) -> str:
+    return str(torch.cuda.get_device_properties(device).uuid)
+
+
+def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
+    if isinstance(device_maybe_uuid, int):
+        return device_maybe_uuid
+
+    if isinstance(device_maybe_uuid, str):
+        for device in range(torch.cuda.device_count()):
+            if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
+                return device
+        raise Exception("Invalid device_uuid=" + device_maybe_uuid)
+
+    raise Exception(f"Unknown type: {device_maybe_uuid=}")
+
+
+def _modify_tuple(t, index: int, modifier: Callable):
+    return *t[:index], modifier(t[index]), *t[index + 1 :]
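Note: patch_torch.py is a new module whose single public entry point, monkey_patch_torch_reductions(), wraps torch.multiprocessing's tensor reduction hooks so that a CUDA tensor shared across processes is identified by the GPU's UUID rather than its ordinal device index, which may not match between processes (see the linked PyTorch PR). Below is a minimal usage sketch; the worker logic is illustrative, not taken from sglang, and it assumes a CUDA-capable machine.

import torch
import torch.multiprocessing as mp

from sglang.srt.patch_torch import monkey_patch_torch_reductions


def worker(shared: torch.Tensor) -> None:
    # The child process rebuilds the shared tensor; with the patch applied the
    # device is resolved from its UUID, so index remapping is harmless.
    print(shared.device, shared.sum().item())


if __name__ == "__main__":
    monkey_patch_torch_reductions()  # idempotent: repeated calls return early
    mp.set_start_method("spawn", force=True)
    t = torch.ones(4, device="cuda")
    p = mp.Process(target=worker, args=(t,))
    p.start()
    p.join()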