sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -45,7 +45,7 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternTokenPairs,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2Model
@@ -472,18 +472,24 @@ class Qwen2VLForConditionalGeneration(nn.Module):

     # Use grid_t * grid_w * grid_h to pad tokens for each image
     # add replaced padding by unique image hash
-    def pad_input_ids(self, input_ids: List[int],
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
         # Get all special token IDs
-        im_start_id: int =
-        im_end_id: int =
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id

         media_token_pairs = [(im_start_id, im_end_id)]
         pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
-        return pattern.pad_input_tokens(input_ids,
+        return pattern.pad_input_tokens(input_ids, mm_inputs)

-    def get_image_feature(self,
-
-
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thws = torch.concat([item.image_grid_thws for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thws.dim() == 2, image_grid_thws.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thws)
         return image_embeds

     def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
@@ -527,27 +533,20 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 "multimodal section rotary embedding requires "
                 f"(3, seq_len) positions, but got {positions.size()}"
             )
-
-        inputs_embeds = general_mm_embed_routine(
+        hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
-
-
-        )
-
-        hidden_states = self.model(
-            input_ids=None,
+            language_model=self.model,
+            image_data_embedding_func=self.get_image_feature,
             positions=positions,
-            forward_batch=forward_batch,
-            input_embeds=inputs_embeds,
         )

-        if
+        if get_embedding:
+            return self.pooler(hidden_states, forward_batch)
+        else:
             return self.logits_processor(
                 input_ids, hidden_states, self.lm_head, forward_batch
             )
-        else:
-            return self.pooler(hidden_states, forward_batch)

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
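Note: the hunks above route Qwen2-VL's multimodal embedding through general_mm_embed_routine and change get_image_feature to take a list of MultimodalDataItem objects, concatenating their pixel values and grid shapes before a single visual-encoder call. A minimal sketch of that contract, assuming a hypothetical stand-in item type and a dummy encoder in place of self.visual:

    from dataclasses import dataclass
    from typing import List

    import torch


    @dataclass
    class Item:  # hypothetical stand-in for MultimodalDataItem
        pixel_values: torch.Tensor      # (num_patches, patch_dim)
        image_grid_thws: torch.Tensor   # (num_images, 3)


    def get_image_feature(items: List[Item], visual) -> torch.Tensor:
        # Concatenate per-item tensors, as in the diff, then encode once.
        pixel_values = torch.cat([it.pixel_values for it in items], dim=0)
        image_grid_thws = torch.cat([it.image_grid_thws for it in items], dim=0)
        assert pixel_values.dim() == 2 and image_grid_thws.dim() == 2
        return visual(pixel_values, grid_thw=image_grid_thws)


    items = [Item(torch.randn(4, 8), torch.tensor([[1, 2, 2]]))]
    print(get_image_feature(items, visual=lambda x, grid_thw: x).shape)  # torch.Size([4, 8])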
sglang/srt/openai_api/adapter.py
CHANGED
@@ -20,7 +20,7 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import
+from typing import Dict, List

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -645,7 +645,7 @@ def v1_generate_response(
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason":
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -657,7 +657,7 @@ def v1_generate_response(
                 index=idx,
                 text=text,
                 logprobs=logprobs,
-                finish_reason=
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -805,7 +805,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                         index=index,
                         text=delta,
                         logprobs=logprobs,
-                        finish_reason=
+                        finish_reason=finish_reason["type"] if finish_reason else None,
                         matched_stop=(
                             finish_reason["matched"]
                             if finish_reason and "matched" in finish_reason
@@ -897,6 +897,7 @@ def v1_chat_generate_request(
     request_ids: List[str] = None,
 ):
     input_ids = []
+    prompts = []
     sampling_params_list = []
     image_data_list = []
     audio_data_list = []
@@ -916,6 +917,7 @@
         # - audio_data: None or a list of audio strings (URLs).
         # None skips any image processing in GenerateReqInput.
         strict_tag = None
+        prompt = ""
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -1005,11 +1007,13 @@
             image_data = None
             audio_data = None
             modalities = []
+            prompt = request.messages
         input_ids.append(prompt_ids)
         return_logprobs.append(request.logprobs)
         logprob_start_lens.append(-1)
         top_logprobs_nums.append(request.top_logprobs or 0)
         lora_paths.append(request.lora_path)
+        prompts.append(prompt)

         sampling_params = {
             "temperature": request.temperature,
@@ -1063,10 +1067,14 @@
         audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
-        if
-
+        if tokenizer_manager.model_config.is_multimodal:
+            # processor will need text input
+            prompt_kwargs = {"text": prompts[0]}
         else:
-
+            if isinstance(input_ids[0], str):
+                prompt_kwargs = {"text": input_ids[0]}
+            else:
+                prompt_kwargs = {"input_ids": input_ids[0]}
         sampling_params_list = sampling_params_list[0]
         image_data_list = image_data_list[0]
         audio_data_list = audio_data_list[0]
@@ -1076,10 +1084,14 @@
         modalities_list = modalities_list[0]
         lora_paths = lora_paths[0]
     else:
-        if
-
+        if tokenizer_manager.model_config.is_multimodal:
+            # processor will need text input
+            prompt_kwargs = {"text": prompts}
         else:
-
+            if isinstance(input_ids[0], str):
+                prompt_kwargs = {"text": input_ids}
+            else:
+                prompt_kwargs = {"input_ids": input_ids}

     adapted_request = GenerateReqInput(
         **prompt_kwargs,
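Note: the two hunks above decide how the prompt reaches GenerateReqInput: multimodal models are handed the raw chat text (their processor re-tokenizes it), while text-only models keep the tokenized ids unless the request already supplied a plain string. A hedged paraphrase of that branching; the helper below is illustrative, not part of the package:

    from typing import List, Union


    def build_prompt_kwargs(
        is_multimodal: bool,
        prompts: List[str],
        input_ids: List[Union[str, List[int]]],
    ) -> dict:
        if is_multimodal:
            return {"text": prompts[0] if len(prompts) == 1 else prompts}
        first = input_ids[0]
        if len(input_ids) == 1:
            return {"text": first} if isinstance(first, str) else {"input_ids": first}
        return {"text": input_ids} if isinstance(first, str) else {"input_ids": input_ids}


    print(build_prompt_kwargs(True, ["hi"], [[1, 2, 3]]))   # {'text': 'hi'}
    print(build_prompt_kwargs(False, ["hi"], [[1, 2, 3]]))  # {'input_ids': [1, 2, 3]}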
@@ -1119,7 +1131,9 @@ def v1_chat_generate_response(
         if logprobs:
             logprobs = to_openai_style_logprobs(
                 output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
-                output_top_logprobs=ret_item["meta_info"]
+                output_top_logprobs=ret_item["meta_info"].get(
+                    "output_top_logprobs", None
+                ),
             )
             token_logprobs = []
             for token_idx, (token, logprob) in enumerate(
@@ -1216,7 +1230,7 @@ def v1_chat_generate_response(
                     "reasoning_content": reasoning_text if reasoning_text else None,
                 },
                 "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
-                "finish_reason":
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1233,7 +1247,7 @@
                     reasoning_content=reasoning_text if reasoning_text else None,
                 ),
                 logprobs=choice_logprobs,
-                finish_reason=
+                finish_reason=finish_reason["type"] if finish_reason else None,
                 matched_stop=(
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -1329,9 +1343,9 @@ async def v1_chat_completions(
                         output_token_logprobs=content["meta_info"][
                             "output_token_logprobs"
                         ][n_prev_token:],
-                        output_top_logprobs=content["meta_info"]
-                            "output_top_logprobs"
-
+                        output_top_logprobs=content["meta_info"].get(
+                            "output_top_logprobs", []
+                        )[n_prev_token:],
                     )

                     n_prev_token = len(
@@ -1377,23 +1391,11 @@ async def v1_chat_completions(
                    if is_first:
                        # First chunk with role
                        is_first = False
-
-                            tokenizer_manager.server_args.reasoning_parser
-                            and request.separate_reasoning
-                        ):
-                            delta = DeltaMessage(
-                                role="assistant", reasoning_content=None
-                            )
-                        else:
-                            delta = DeltaMessage(role="assistant", content=None)
+                        delta = DeltaMessage(role="assistant")
                        choice_data = ChatCompletionResponseStreamChoice(
                            index=index,
                            delta=delta,
-                            finish_reason=
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                            matched_stop=(
                                finish_reason["matched"]
                                if finish_reason and "matched" in finish_reason
@@ -1434,12 +1436,7 @@
                                    reasoning_text if reasoning_text else None
                                )
                            ),
-                            finish_reason=
-                                None
-                                if finish_reason_type
-                                and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                        )
                        chunk = ChatCompletionStreamResponse(
                            id=content["meta_info"]["id"],
@@ -1471,12 +1468,7 @@
                            delta=DeltaMessage(
                                content=normal_text if normal_text else None
                            ),
-                            finish_reason=
-                                None
-                                if finish_reason_type
-                                and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                        )
                        chunk = ChatCompletionStreamResponse(
                            id=content["meta_info"]["id"],
@@ -1490,11 +1482,7 @@
                        for call_item in calls:
                            # transform call_item -> FunctionResponse + ToolCall

-                            if
-                                content["meta_info"]["finish_reason"]
-                                and content["meta_info"]["finish_reason"]["type"]
-                                == "stop"
-                            ):
+                            if finish_reason_type == "stop":
                                latest_delta_len = 0
                                if isinstance(call_item.parameters, str):
                                    latest_delta_len = len(call_item.parameters)
@@ -1515,6 +1503,8 @@
                                )
                                call_item.parameters = remaining_call

+                                finish_reason_type = "tool_calls"
+
                            tool_call = ToolCall(
                                id=str(call_item.tool_index),
                                function=FunctionResponse(
@@ -1524,10 +1514,13 @@
                            )
                            choice_data = ChatCompletionResponseStreamChoice(
                                index=index,
-                                delta=DeltaMessage(
-
-
-
+                                delta=DeltaMessage(tool_calls=[tool_call]),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ),  # additional chunk will be return
                            )
                            chunk = ChatCompletionStreamResponse(
                                id=content["meta_info"]["id"],
@@ -1542,30 +1535,44 @@

                    else:
                        # No tool calls => just treat this as normal text
-
-
-
-
-
-
-                        else
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        if delta or not (
+                            request.stream_options
+                            and request.stream_options.include_usage
+                        ):
+                            choice_data = ChatCompletionResponseStreamChoice(
+                                index=index,
+                                delta=DeltaMessage(content=delta if delta else None),
+                                finish_reason=(
+                                    None
+                                    if request.stream_options
+                                    and request.stream_options.include_usage
+                                    else finish_reason_type
+                                ),
+                                matched_stop=(
+                                    finish_reason["matched"]
+                                    if finish_reason and "matched" in finish_reason
+                                    else None
+                                ),
+                                logprobs=choice_logprobs,
+                            )
+                            chunk = ChatCompletionStreamResponse(
+                                id=content["meta_info"]["id"],
+                                created=created,
+                                choices=[choice_data],
+                                model=request.model,
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
+                    stream_buffers[index] = new_stream_buffer
+                    is_firsts[index] = is_first
+                if finish_reason_type == "stop" and request.tool_choice != "none":
+                    parser = FunctionCallParser(
+                        tools=request.tools,
+                        tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+                    )
+                    if parser.has_tool_call(new_stream_buffer):
+                        # if the stream ends with empty string after tool calls
+                        finish_reason_type = "tool_calls"
+
                if request.stream_options and request.stream_options.include_usage:
                    total_prompt_tokens = sum(
                        tokens
@@ -1590,17 +1597,22 @@
                        prompt_tokens_details=prompt_tokens_details,
                    )

-
-
-
-
-
-
-
-
-
-
-
+                else:
+                    usage = None
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(),
+                            finish_reason=finish_reason_type,
+                        )
+                    ],
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {final_usage_chunk.model_dump_json()}\n\n"
            except ValueError as e:
                error = create_streaming_error_response(str(e))
                yield f"data: {error}\n\n"
@@ -1653,18 +1665,19 @@ def v1_embedding_request(all_requests, tokenizer_manager):
        elif isinstance(prompt, list) and isinstance(
            prompt[0], MultimodalEmbeddingInput
        ):
-            assert (
-                chat_template_name is not None
-            ), "chat_template_name is required for multimodal inputs"
            texts = []
            images = []
            for item in prompt:
-
+                # TODO simply use padding for text, we should use a better way to handle this
+                texts.append(item.text if item.text is not None else "padding")
                images.append(item.image if item.image is not None else None)
-            convs = generate_embedding_convs(texts, images, chat_template_name)
            generate_prompts = []
-
-
+            if chat_template_name is not None:
+                convs = generate_embedding_convs(texts, images, chat_template_name)
+                for conv in convs:
+                    generate_prompts.append(conv.get_prompt())
+            else:
+                generate_prompts = texts
            if len(generate_prompts) == 1:
                prompt_kwargs = {"text": generate_prompts[0], "image_data": images[0]}
            else:
sglang/srt/openai_api/protocol.py
CHANGED
@@ -28,6 +28,7 @@ class ModelCard(BaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "sglang"
     root: Optional[str] = None
+    max_model_len: Optional[int] = None


 class ModelList(BaseModel):
@@ -187,7 +188,7 @@ class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason:
+    finish_reason: Literal["stop", "length", "content_filter"]
     matched_stop: Union[None, int, str] = None


@@ -204,7 +205,7 @@ class CompletionResponseStreamChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[
+    finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
     matched_stop: Union[None, int, str] = None


@@ -322,7 +323,7 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = None
     n: int = 1
     presence_penalty: float = 0.0
-    response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
@@ -387,7 +388,9 @@ class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason:
+    finish_reason: Literal[
+        "stop", "length", "tool_calls", "content_filter", "function_call"
+    ]
     matched_stop: Union[None, int, str] = None


@@ -411,7 +414,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
-    finish_reason: Optional[
+    finish_reason: Optional[
+        Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+    ] = None
     matched_stop: Union[None, int, str] = None

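Note: the protocol.py hunks above pin finish_reason to fixed Literal values instead of free-form strings. A standalone pydantic sketch of the effect; the class name and reduced field set here are illustrative only:

    from typing import Literal, Optional

    from pydantic import BaseModel, ValidationError


    class StreamChoiceSketch(BaseModel):  # not the package's actual class
        index: int
        finish_reason: Optional[
            Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
        ] = None


    print(StreamChoiceSketch(index=0, finish_reason="tool_calls"))

    try:
        StreamChoiceSketch(index=0, finish_reason="other")
    except ValidationError as exc:
        print("rejected:", exc.errors()[0]["type"])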
sglang/srt/patch_torch.py
ADDED
@@ -0,0 +1,71 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from typing import Callable, Union
+
+import torch
+from torch.multiprocessing import reductions
+
+
+def monkey_patch_torch_reductions():
+    """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+
+    if hasattr(reductions, "_reduce_tensor_original"):
+        return
+
+    reductions._reduce_tensor_original = reductions.reduce_tensor
+    reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor
+
+    reductions.reduce_tensor = _reduce_tensor_modified
+    reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified
+
+    reductions.init_reductions()
+
+
+# The signature has not been changed for years, and we will not need this when the next version is released,
+# so it looks safe to use a constant.
+_REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
+
+
+def _reduce_tensor_modified(*args, **kwargs):
+    output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)
+    output_args = _modify_tuple(
+        output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
+    )
+    return output_fn, output_args
+
+
+def _rebuild_cuda_tensor_modified(*args):
+    args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid)
+    return reductions._rebuild_cuda_tensor_original(*args)
+
+
+def _device_to_uuid(device: int) -> str:
+    return str(torch.cuda.get_device_properties(device).uuid)
+
+
+def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
+    if isinstance(device_maybe_uuid, int):
+        return device_maybe_uuid
+
+    if isinstance(device_maybe_uuid, str):
+        for device in range(torch.cuda.device_count()):
+            if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
+                return device
+        raise Exception("Invalid device_uuid=" + device_maybe_uuid)
+
+    raise Exception(f"Unknown type: {device_maybe_uuid=}")
+
+
+def _modify_tuple(t, index: int, modifier: Callable):
+    return *t[:index], modifier(t[index]), *t[index + 1 :]