sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/eval/loogle_eval.py
ADDED
@@ -0,0 +1,157 @@
+import argparse
+import asyncio
+import os
+import pickle
+from pathlib import Path
+from typing import List
+
+import openai
+import torch
+from bert_score import BERTScorer
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+def get_client(api_url: str) -> openai.AsyncOpenAI:
+    if os.getenv("OPENAI_API_KEY") is None:
+        os.environ["OPENAI_API_KEY"] = "EMPTY"
+    return openai.AsyncOpenAI(base_url=api_url)
+
+
+def get_dataset():
+    return load_dataset("bigai-nlco/LooGLE", "longdep_qa", split="test")
+
+
+async def fetch_response(
+    client: openai.AsyncOpenAI,
+    context: str,
+    question: str,
+    semaphore: asyncio.Semaphore,
+    index: int,
+    model: str,
+    output_dir: Path,
+):
+    output_file = output_dir / f"response_{index}.pkl"
+    if output_file.exists():
+        return
+
+    prompt = (
+        "Please answer the question based on the long texts below.\n"
+        f"{context}\n"
+        f"Question: {question}\n"
+        "Answer:"
+    )
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+
+    async with semaphore:
+        try:
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0.0,
+                max_tokens=512,
+            )
+        except openai.BadRequestError as e:
+            with open(output_file, "wb") as f:
+                pickle.dump({"error": str(e)}, f)
+            return
+
+    with open(output_file, "wb") as f:
+        pickle.dump(response, f)
+
+
+async def benchmark(args):
+    dataset = get_dataset()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    client = get_client(args.api_url)
+    semaphore = asyncio.Semaphore(args.max_concurrency)
+
+    tasks: List[asyncio.Task] = []
+    for idx, ex in enumerate(dataset):
+        tasks.append(
+            asyncio.create_task(
+                fetch_response(
+                    client,
+                    ex["context"],
+                    ex["question"],
+                    semaphore,
+                    idx,
+                    args.model,
+                    output_dir,
+                )
+            )
+        )
+
+    for _ in tqdm(
+        asyncio.as_completed(tasks), total=len(tasks), desc="Running benchmark"
+    ):
+        await _
+
+
+def analyse(args):
+    dataset = get_dataset()
+    output_dir = Path(args.output_dir)
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    scorer = BERTScorer(lang="en", device=device)
+
+    hyps: List[str] = []
+    refs: List[str] = []
+    for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+        pkl_file = output_dir / f"response_{idx}.pkl"
+        if not pkl_file.exists():
+            raise FileNotFoundError(pkl_file)
+
+        response = pickle.load(open(pkl_file, "rb"))
+        if isinstance(response, dict) and "error" in response:
+            continue
+
+        hyps.append(response.choices[0].message.content.strip())
+        refs.append(ex["answer"])
+
+    if not hyps:
+        print("No valid responses to score!")
+        return
+
+    batch_size = 64
+    all_f1: List[float] = []
+    for i in tqdm(range(0, len(hyps), batch_size), desc="Scoring batches"):
+        h_batch = hyps[i : i + batch_size]
+        r_batch = refs[i : i + batch_size]
+        _, _, f1_scores = scorer.score(h_batch, r_batch, verbose=False)
+        all_f1.extend([float(x) for x in f1_scores])
+
+    avg = sum(all_f1) / len(all_f1)
+    print(f"Average BERTScore (F1): {avg:.2%}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Run benchmark and evaluation in one go."
+    )
+    parser.add_argument(
+        "--api-url",
+        default="http://127.0.0.1:30000/v1",
+        help="OpenAI-compatible API base URL",
+    )
+    parser.add_argument(
+        "--model",
+        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+        help="Model name or ID, only used for model name",
+    )
+    parser.add_argument(
+        "--max-concurrency", type=int, default=144, help="Maximum concurrent requests"
+    )
+    parser.add_argument(
+        "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
+    )
+    args = parser.parse_args()
+
+    asyncio.run(benchmark(args))
+
+    analyse(args)
sglang/lang/chat_template.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Callable, Dict, List, Tuple
@@ -71,9 +72,9 @@ def get_chat_template(name):
 
 def get_chat_template_by_model_path(model_path):
     for matching_func in matching_function_registry:
+        template_name = matching_func(model_path)
+        if template_name is not None:
+            return get_chat_template(template_name)
     return get_chat_template("default")
 
 
@@ -193,6 +194,21 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="llama-3-instruct",
@@ -270,6 +286,29 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="janus",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -395,6 +434,20 @@ register_chat_template(
     )
 )
 
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+register_chat_template(
+    ChatTemplate(
+        name="internvl-2-5",
+        default_system_prompt="你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -442,127 +495,118 @@ register_chat_template(
 
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
+    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+        r"base", model_path, re.IGNORECASE
+    ):
+        return "deepseek-v3"
 
 
 @register_chat_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
-    if "janus"
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
 
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
-    if "dbrx"
+    if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+        r"instruct", model_path, re.IGNORECASE
+    ):
+        return "dbrx-instruct"
 
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
-    if "vicuna"
-    if "llava-v1.5" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
-    if "llava-next-video-7b" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
 
 
 @register_chat_template_matching_function
 def match_llama2_chat(model_path: str):
+    if re.search(
+        r"llama-2.*chat|codellama.*instruct",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "llama-2-chat"
+
+
+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
 
 
 @register_chat_template_matching_function
 def match_llama3_instruct(model_path: str):
-        return get_chat_template("llama-3-instruct")
+    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
+        return "llama-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
-        return get_chat_template("qwen")
-        "llava-v1.6-34b" in model_path
-        or "llava-v1.6-yi-34b" in model_path
-        or "llava-next-video-34b" in model_path
-        or "llava-onevision-qwen2" in model_path
+    if re.search(r"tinyllama", model_path, re.IGNORECASE):
+        return "chatml"
+    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+        return "qwen2-vl"
+    if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "qwen"
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
     ):
+        return "chatml-llava"
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
+    if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "yi-vl"
+    elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+        return "yi-1.5"
 
 
 @register_chat_template_matching_function
 def match_gemma_it(model_path: str):
-        return get_chat_template("gemma-it")
+    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
-        return get_chat_template("minicpmo")
+    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+        return "minicpmv"
+    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+        return "minicpmo"
 
 
 @register_chat_template_matching_function
 def match_c4ai_command_r(model_path: str):
-        return get_chat_template("c4ai-command-r")
+    if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+        return "c4ai-command-r"
 
 
 @register_chat_template_matching_function
 def match_granite_instruct(model_path: str):
-    # need to be updated. For now, assume that the Granite 3.0
-    # template works across the board.
-    if "granite" in model_path and "instruct" in model_path:
-        return get_chat_template("granite-3-instruct")
+    if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+        return "granite-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_gemma3_instruct(model_path: str):
+    if re.search(r"gemma-3", model_path, re.IGNORECASE):
+        return "gemma-it"
+
+
+@register_chat_template_matching_function
+def match_internvl_chat(model_path: str):
+    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+        return "internvl-2-5"
 
 
 if __name__ == "__main__":
sglang/lang/tracer.py
CHANGED
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
         with TracingScope(tracer):
             tracer.ret_value = program.func(tracer, **arguments)
     except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be
+        # Some exceptions may not be caught
        pass
 
     # Run and cache prefix
sglang/srt/configs/deepseekvl2.py
CHANGED
@@ -48,6 +48,9 @@ class DictOutput(object):
     def __getitem__(self, item):
         return self.__dict__[item]
 
+    def __contains__(self, key):
+        return key in self.__dict__
+
     def __setitem__(self, key, value):
         self.__dict__[key] = value
 
@@ -413,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
         h = w = math.ceil(
             (self.image_size // self.patch_size) / self.downsample_ratio
         )
-        # global views tokens h * (w + 1), 1 is for line
+        # global views tokens h * (w + 1), 1 is for line separator
         tokenized_image = [self.image_token_id] * h * (w + 1)
-        # add a
+        # add a separator between global and local views
         tokenized_image += [self.image_token_id]
         # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
         tokenized_image += (
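The three added lines in DictOutput make `in` checks work on these output objects, which store their items in __dict__. A standalone illustration (the class body below is copied from the hunk; only the usage at the bottom is new):

# Standalone copy of DictOutput for illustration; in the package it lives in
# sglang/srt/configs/deepseekvl2.py.
class DictOutput(object):
    def __getitem__(self, item):
        return self.__dict__[item]

    def __contains__(self, key):
        return key in self.__dict__

    def __setitem__(self, key, value):
        self.__dict__[key] = value

out = DictOutput()
out["input_ids"] = [1, 2, 3]
print("input_ids" in out)     # True, via the new __contains__
print("pixel_values" in out)  # False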
sglang/srt/configs/device_config.py
CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
     device: Optional[torch.device]
 
     def __init__(self, device: str = "cuda") -> None:
-        if device in ["cuda", "xpu", "hpu", "cpu"]:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
             self.device_type = device
         else:
             raise RuntimeError(f"Not supported device type: {device}")
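The device_config.py change simply adds "npu" to the accepted device strings, in line with the new NPU communicator elsewhere in this release. A tiny sketch of the validation behaviour (a standalone function for illustration, not the real DeviceConfig class):

# Standalone sketch of the widened device-type check; DeviceConfig itself also
# resolves and stores a torch.device, which is omitted here.
SUPPORTED_DEVICE_TYPES = ["cuda", "xpu", "hpu", "cpu", "npu"]

def validate_device_type(device: str = "cuda") -> str:
    if device in SUPPORTED_DEVICE_TYPES:
        return device
    raise RuntimeError(f"Not supported device type: {device}")

print(validate_device_type("npu"))  # accepted as of 0.4.6.post4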
|