sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (150)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +3 -13
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +158 -8
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +119 -75
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +5 -2
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/internvl.py +696 -0
  13. sglang/srt/configs/janus_pro.py +3 -0
  14. sglang/srt/configs/model_config.py +18 -0
  15. sglang/srt/constrained/base_grammar_backend.py +55 -72
  16. sglang/srt/constrained/llguidance_backend.py +25 -21
  17. sglang/srt/constrained/outlines_backend.py +27 -26
  18. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  19. sglang/srt/constrained/xgrammar_backend.py +71 -53
  20. sglang/srt/conversation.py +78 -46
  21. sglang/srt/disaggregation/base/conn.py +1 -0
  22. sglang/srt/disaggregation/decode.py +11 -3
  23. sglang/srt/disaggregation/fake/conn.py +1 -1
  24. sglang/srt/disaggregation/mini_lb.py +74 -23
  25. sglang/srt/disaggregation/mooncake/conn.py +236 -138
  26. sglang/srt/disaggregation/nixl/conn.py +242 -71
  27. sglang/srt/disaggregation/prefill.py +7 -4
  28. sglang/srt/disaggregation/utils.py +51 -2
  29. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  30. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  31. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  32. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  33. sglang/srt/distributed/parallel_state.py +22 -1
  34. sglang/srt/entrypoints/engine.py +31 -4
  35. sglang/srt/entrypoints/http_server.py +45 -3
  36. sglang/srt/entrypoints/verl_engine.py +3 -2
  37. sglang/srt/function_call_parser.py +2 -2
  38. sglang/srt/hf_transformers_utils.py +20 -1
  39. sglang/srt/layers/attention/flashattention_backend.py +147 -51
  40. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  41. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  42. sglang/srt/layers/attention/merge_state.py +46 -0
  43. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  44. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  45. sglang/srt/layers/attention/utils.py +4 -2
  46. sglang/srt/layers/attention/vision.py +290 -163
  47. sglang/srt/layers/dp_attention.py +71 -21
  48. sglang/srt/layers/layernorm.py +1 -1
  49. sglang/srt/layers/logits_processor.py +46 -11
  50. sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
  51. sglang/srt/layers/moe/ep_moe/layer.py +121 -2
  52. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  56. sglang/srt/layers/moe/topk.py +1 -1
  57. sglang/srt/layers/quantization/__init__.py +1 -1
  58. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  59. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  60. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  61. sglang/srt/layers/quantization/deep_gemm.py +77 -71
  62. sglang/srt/layers/quantization/fp8.py +110 -97
  63. sglang/srt/layers/quantization/fp8_kernel.py +81 -62
  64. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  65. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  66. sglang/srt/layers/quantization/kv_cache.py +3 -10
  67. sglang/srt/layers/quantization/utils.py +0 -5
  68. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  69. sglang/srt/layers/sampler.py +0 -4
  70. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  71. sglang/srt/lora/lora_manager.py +11 -14
  72. sglang/srt/lora/mem_pool.py +4 -4
  73. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  74. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  75. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  76. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  77. sglang/srt/lora/utils.py +1 -1
  78. sglang/srt/managers/cache_controller.py +115 -119
  79. sglang/srt/managers/data_parallel_controller.py +3 -3
  80. sglang/srt/managers/detokenizer_manager.py +21 -8
  81. sglang/srt/managers/io_struct.py +13 -1
  82. sglang/srt/managers/mm_utils.py +1 -1
  83. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  84. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  85. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  86. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  87. sglang/srt/managers/schedule_batch.py +93 -23
  88. sglang/srt/managers/schedule_policy.py +11 -8
  89. sglang/srt/managers/scheduler.py +140 -100
  90. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  91. sglang/srt/managers/tokenizer_manager.py +157 -47
  92. sglang/srt/managers/tp_worker.py +21 -21
  93. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  94. sglang/srt/mem_cache/chunk_cache.py +2 -0
  95. sglang/srt/mem_cache/memory_pool.py +4 -2
  96. sglang/srt/metrics/collector.py +312 -37
  97. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  98. sglang/srt/model_executor/forward_batch_info.py +1 -1
  99. sglang/srt/model_executor/model_runner.py +57 -41
  100. sglang/srt/model_loader/loader.py +18 -11
  101. sglang/srt/models/clip.py +4 -4
  102. sglang/srt/models/deepseek_janus_pro.py +3 -3
  103. sglang/srt/models/deepseek_nextn.py +1 -20
  104. sglang/srt/models/deepseek_v2.py +77 -39
  105. sglang/srt/models/gemma3_mm.py +1 -1
  106. sglang/srt/models/internlm2.py +3 -0
  107. sglang/srt/models/internvl.py +670 -0
  108. sglang/srt/models/llama.py +3 -1
  109. sglang/srt/models/llama4.py +58 -13
  110. sglang/srt/models/llava.py +248 -5
  111. sglang/srt/models/minicpmv.py +1 -1
  112. sglang/srt/models/mixtral.py +98 -34
  113. sglang/srt/models/mllama.py +1 -1
  114. sglang/srt/models/phi3_small.py +16 -2
  115. sglang/srt/models/pixtral.py +467 -0
  116. sglang/srt/models/qwen2_5_vl.py +8 -4
  117. sglang/srt/models/qwen2_vl.py +4 -4
  118. sglang/srt/models/roberta.py +1 -1
  119. sglang/srt/models/torch_native_llama.py +1 -1
  120. sglang/srt/models/xiaomi_mimo.py +171 -0
  121. sglang/srt/openai_api/adapter.py +52 -42
  122. sglang/srt/openai_api/protocol.py +20 -16
  123. sglang/srt/reasoning_parser.py +1 -1
  124. sglang/srt/sampling/custom_logit_processor.py +18 -3
  125. sglang/srt/sampling/sampling_batch_info.py +2 -2
  126. sglang/srt/sampling/sampling_params.py +2 -0
  127. sglang/srt/server_args.py +64 -10
  128. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  129. sglang/srt/speculative/eagle_utils.py +7 -7
  130. sglang/srt/speculative/eagle_worker.py +22 -19
  131. sglang/srt/utils.py +41 -6
  132. sglang/test/few_shot_gsm8k.py +2 -2
  133. sglang/test/few_shot_gsm8k_engine.py +2 -2
  134. sglang/test/run_eval.py +2 -2
  135. sglang/test/runners.py +8 -1
  136. sglang/test/send_one.py +13 -3
  137. sglang/test/simple_eval_common.py +1 -1
  138. sglang/test/simple_eval_humaneval.py +1 -1
  139. sglang/test/test_block_fp8.py +2 -2
  140. sglang/test/test_deepep_utils.py +219 -0
  141. sglang/test/test_programs.py +5 -5
  142. sglang/test/test_utils.py +92 -15
  143. sglang/utils.py +1 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
  146. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
  147. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
  148. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  149. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/eval/loogle_eval.py ADDED
@@ -0,0 +1,157 @@
+ import argparse
+ import asyncio
+ import os
+ import pickle
+ from pathlib import Path
+ from typing import List
+
+ import openai
+ import torch
+ from bert_score import BERTScorer
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+
+ def get_client(api_url: str) -> openai.AsyncOpenAI:
+     if os.getenv("OPENAI_API_KEY") is None:
+         os.environ["OPENAI_API_KEY"] = "EMPTY"
+     return openai.AsyncOpenAI(base_url=api_url)
+
+
+ def get_dataset():
+     return load_dataset("bigai-nlco/LooGLE", "longdep_qa", split="test")
+
+
+ async def fetch_response(
+     client: openai.AsyncOpenAI,
+     context: str,
+     question: str,
+     semaphore: asyncio.Semaphore,
+     index: int,
+     model: str,
+     output_dir: Path,
+ ):
+     output_file = output_dir / f"response_{index}.pkl"
+     if output_file.exists():
+         return
+
+     prompt = (
+         "Please answer the question based on the long texts below.\n"
+         f"{context}\n"
+         f"Question: {question}\n"
+         "Answer:"
+     )
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": prompt},
+     ]
+
+     async with semaphore:
+         try:
+             response = await client.chat.completions.create(
+                 model=model,
+                 messages=messages,
+                 temperature=0.0,
+                 max_tokens=512,
+             )
+         except openai.BadRequestError as e:
+             with open(output_file, "wb") as f:
+                 pickle.dump({"error": str(e)}, f)
+             return
+
+     with open(output_file, "wb") as f:
+         pickle.dump(response, f)
+
+
+ async def benchmark(args):
+     dataset = get_dataset()
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     client = get_client(args.api_url)
+     semaphore = asyncio.Semaphore(args.max_concurrency)
+
+     tasks: List[asyncio.Task] = []
+     for idx, ex in enumerate(dataset):
+         tasks.append(
+             asyncio.create_task(
+                 fetch_response(
+                     client,
+                     ex["context"],
+                     ex["question"],
+                     semaphore,
+                     idx,
+                     args.model,
+                     output_dir,
+                 )
+             )
+         )
+
+     for _ in tqdm(
+         asyncio.as_completed(tasks), total=len(tasks), desc="Running benchmark"
+     ):
+         await _
+
+
+ def analyse(args):
+     dataset = get_dataset()
+     output_dir = Path(args.output_dir)
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     scorer = BERTScorer(lang="en", device=device)
+
+     hyps: List[str] = []
+     refs: List[str] = []
+     for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+         pkl_file = output_dir / f"response_{idx}.pkl"
+         if not pkl_file.exists():
+             raise FileNotFoundError(pkl_file)
+
+         response = pickle.load(open(pkl_file, "rb"))
+         if isinstance(response, dict) and "error" in response:
+             continue
+
+         hyps.append(response.choices[0].message.content.strip())
+         refs.append(ex["answer"])
+
+     if not hyps:
+         print("No valid responses to score!")
+         return
+
+     batch_size = 64
+     all_f1: List[float] = []
+     for i in tqdm(range(0, len(hyps), batch_size), desc="Scoring batches"):
+         h_batch = hyps[i : i + batch_size]
+         r_batch = refs[i : i + batch_size]
+         _, _, f1_scores = scorer.score(h_batch, r_batch, verbose=False)
+         all_f1.extend([float(x) for x in f1_scores])
+
+     avg = sum(all_f1) / len(all_f1)
+     print(f"Average BERTScore (F1): {avg:.2%}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="Run benchmark and evaluation in one go."
+     )
+     parser.add_argument(
+         "--api-url",
+         default="http://127.0.0.1:30000/v1",
+         help="OpenAI-compatible API base URL",
+     )
+     parser.add_argument(
+         "--model",
+         default="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+         help="Model name or ID, only used for model name",
+     )
+     parser.add_argument(
+         "--max-concurrency", type=int, default=144, help="Maximum concurrent requests"
+     )
+     parser.add_argument(
+         "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
+     )
+     args = parser.parse_args()
+
+     asyncio.run(benchmark(args))
+
+     analyse(args)
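
The new script is self-contained: `benchmark` caches one pickled response per example (skipping files that already exist, so interrupted runs resume), and `analyse` scores the cached answers with BERTScore. A minimal sketch of driving it programmatically rather than via the CLI; the import path is assumed from the file listing above, and the `Namespace` fields mirror the parser defaults:

```python
# Sketch only: run the new LooGLE eval without the CLI.
# Assumes this wheel is installed and the module path from the file list above.
import asyncio
from argparse import Namespace

from sglang.eval.loogle_eval import analyse, benchmark

args = Namespace(
    api_url="http://127.0.0.1:30000/v1",  # OpenAI-compatible endpoint
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
    max_concurrency=144,
    output_dir="tmp-output-dir",  # one response_<idx>.pkl per example
)

asyncio.run(benchmark(args))  # fetch and cache responses; reruns skip cached files
analyse(args)                 # print average BERTScore F1 over valid responses
```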
sglang/lang/chat_template.py CHANGED
@@ -1,3 +1,4 @@
+ import re
  from dataclasses import dataclass
  from enum import Enum, auto
  from typing import Callable, Dict, List, Tuple
@@ -71,9 +72,9 @@ def get_chat_template(name):

  def get_chat_template_by_model_path(model_path):
      for matching_func in matching_function_registry:
-         template = matching_func(model_path)
-         if template is not None:
-             return template
+         template_name = matching_func(model_path)
+         if template_name is not None:
+             return get_chat_template(template_name)
      return get_chat_template("default")


@@ -193,6 +194,21 @@ register_chat_template(
      )
  )

+ # Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+ register_chat_template(
+     ChatTemplate(
+         name="mistral",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+             "user": ("[INST] ", " [/INST]"),
+             "assistant": ("", " </s><s>"),
+         },
+         stop_str=("</s>",),
+         image_token="[IMG]",
+     )
+ )
+
  register_chat_template(
      ChatTemplate(
          name="llama-3-instruct",
@@ -270,6 +286,29 @@ register_chat_template(
      )
  )

+ register_chat_template(
+     ChatTemplate(
+         name="janus",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": (
+                 "",
+                 "",
+             ),
+             "user": (
+                 "<|User|>",
+                 "",
+             ),
+             "assistant": (
+                 "<|Assistant|>",
+                 "<|end▁of▁sentence|>",
+             ),
+         },
+         stop_str=("<|end▁of▁sentence|>",),
+         image_token="<image_placeholder>\n",
+     )
+ )
+
  # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
  register_chat_template(
      ChatTemplate(
@@ -395,6 +434,20 @@ register_chat_template(
      )
  )

+ # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+ register_chat_template(
+     ChatTemplate(
+         name="internvl-2-5",
+         default_system_prompt="你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+         role_prefix_and_suffix={
+             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+             "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+             "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+         },
+         stop_str=["<|im_end|>", "<|action_end|>"],
+     )
+ )
+
  register_chat_template(
      ChatTemplate(
          name="granite-3-instruct",
@@ -442,127 +495,118 @@ register_chat_template(

  @register_chat_template_matching_function
  def match_deepseek(model_path: str):
-     if (
-         "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
-     ) and "base" not in model_path.lower():
-         return get_chat_template("deepseek-v3")
+     if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+         r"base", model_path, re.IGNORECASE
+     ):
+         return "deepseek-v3"


  @register_chat_template_matching_function
  def match_deepseek_janus_pro(model_path: str):
-     if "janus" in model_path.lower():
-         return get_chat_template("janus-pro")
+     if re.search(r"janus", model_path, re.IGNORECASE):
+         return "janus-pro"


  @register_chat_template_matching_function
  def match_dbrx(model_path: str):
-     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
-         return get_chat_template("dbrx-instruct")
+     if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+         r"instruct", model_path, re.IGNORECASE
+     ):
+         return "dbrx-instruct"


  @register_chat_template_matching_function
  def match_vicuna(model_path: str):
-     if "vicuna" in model_path.lower():
-         return get_chat_template("vicuna_v1.1")
-     if "llava-v1.5" in model_path.lower():
-         return get_chat_template("vicuna_v1.1")
-     if "llava-next-video-7b" in model_path.lower():
-         return get_chat_template("vicuna_v1.1")
+     if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+         return "vicuna_v1.1"


  @register_chat_template_matching_function
  def match_llama2_chat(model_path: str):
-     model_path = model_path.lower()
-     if "llama-2" in model_path and "chat" in model_path:
-         return get_chat_template("llama-2-chat")
-     if (
-         "mistral" in model_path or "mixtral" in model_path
-     ) and "instruct" in model_path:
-         return get_chat_template("llama-2-chat")
-     if "codellama" in model_path and "instruct" in model_path:
-         return get_chat_template("llama-2-chat")
+     if re.search(
+         r"llama-2.*chat|codellama.*instruct",
+         model_path,
+         re.IGNORECASE,
+     ):
+         return "llama-2-chat"
+
+
+ @register_chat_template_matching_function
+ def match_mistral(model_path: str):
+     if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+         return "mistral"


  @register_chat_template_matching_function
  def match_llama3_instruct(model_path: str):
-     model_path = model_path.lower()
-     if "llama-3" in model_path and "instruct" in model_path:
-         return get_chat_template("llama-3-instruct")
+     if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
+         return "llama-3-instruct"


  @register_chat_template_matching_function
  def match_chat_ml(model_path: str):
-     # import pdb;pdb.set_trace()
-     model_path = model_path.lower()
-     if "tinyllama" in model_path:
-         return get_chat_template("chatml")
-     # Now the suffix for qwen2 chat model is "instruct"
-     if "qwen" in model_path and "vl" in model_path:
-         return get_chat_template("qwen2-vl")
-     if "qwen" in model_path:
-         if "vl" in model_path:
-             return get_chat_template("qwen2-vl")
-         if ("chat" in model_path or "instruct" in model_path) and (
-             "llava" not in model_path
-         ):
-             return get_chat_template("qwen")
-     if (
-         "llava-v1.6-34b" in model_path
-         or "llava-v1.6-yi-34b" in model_path
-         or "llava-next-video-34b" in model_path
-         or "llava-onevision-qwen2" in model_path
+     if re.search(r"tinyllama", model_path, re.IGNORECASE):
+         return "chatml"
+     if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+         return "qwen2-vl"
+     if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+         r"llava", model_path, re.IGNORECASE
+     ):
+         return "qwen"
+     if re.search(
+         r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+         model_path,
+         re.IGNORECASE,
      ):
-         return get_chat_template("chatml-llava")
+         return "chatml-llava"


  @register_chat_template_matching_function
  def match_chat_yi(model_path: str):
-     model_path = model_path.lower()
-     if "yi-vl" in model_path and "llava" not in model_path:
-         return get_chat_template("yi-vl")
-     elif "yi-1.5" in model_path and "chat" in model_path:
-         return get_chat_template("yi-1.5")
+     if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+         r"llava", model_path, re.IGNORECASE
+     ):
+         return "yi-vl"
+     elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+         return "yi-1.5"


  @register_chat_template_matching_function
  def match_gemma_it(model_path: str):
-     model_path = model_path.lower()
-     if "gemma" in model_path and "it" in model_path:
-         return get_chat_template("gemma-it")
+     if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+         return "gemma-it"


  @register_chat_template_matching_function
  def match_openbmb_minicpm(model_path: str):
-     model_path = model_path.lower()
-     if "minicpm-v" in model_path:
-         return get_chat_template("minicpmv")
-     elif "minicpm-o" in model_path:
-         return get_chat_template("minicpmo")
+     if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+         return "minicpmv"
+     elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+         return "minicpmo"


  @register_chat_template_matching_function
  def match_c4ai_command_r(model_path: str):
-     model_path = model_path.lower()
-     if "c4ai-command-r" in model_path:
-         return get_chat_template("c4ai-command-r")
+     if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+         return "c4ai-command-r"


  @register_chat_template_matching_function
  def match_granite_instruct(model_path: str):
-     model_path = model_path.lower()
-     # When future versions of Granite are released, this code may
-     # need to be updated. For now, assume that the Granite 3.0
-     # template works across the board.
-     if "granite" in model_path and "instruct" in model_path:
-         return get_chat_template("granite-3-instruct")
+     if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+         return "granite-3-instruct"


  @register_chat_template_matching_function
  def match_gemma3_instruct(model_path: str):
-     model_path = model_path.lower()
-     if "gemma-3" in model_path and "1b" not in model_path:
-         # gemma-3-1b-it is completion model
-         return get_chat_template("gemma-it")
+     if re.search(r"gemma-3", model_path, re.IGNORECASE):
+         return "gemma-it"
+
+
+ @register_chat_template_matching_function
+ def match_internvl_chat(model_path: str):
+     if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+         return "internvl-2-5"


  if __name__ == "__main__":
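
Net effect of the `chat_template.py` changes: matching functions now return a template name (or `None`) instead of a resolved template, and `get_chat_template_by_model_path` performs the lookup in one place. A standalone sketch of the new contract; the registry and template store below are illustrative stubs, not the real module:

```python
# Stub sketch of the refactored matching flow; names mirror the diff above,
# but TEMPLATES and the registry are placeholders for illustration.
import re

TEMPLATES = {"mistral": "<mistral template>", "default": "<default template>"}
matching_function_registry = []

def register_chat_template_matching_function(fn):
    matching_function_registry.append(fn)
    return fn

def get_chat_template(name):
    return TEMPLATES[name]

@register_chat_template_matching_function
def match_mistral(model_path: str):
    # Matchers now return a template *name*, not a template object.
    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
        return "mistral"

def get_chat_template_by_model_path(model_path):
    for matching_func in matching_function_registry:
        template_name = matching_func(model_path)
        if template_name is not None:
            return get_chat_template(template_name)  # resolution in one place
    return get_chat_template("default")

assert get_chat_template_by_model_path("Mistral-Small-3.1-24B-Instruct-2503") == "<mistral template>"
assert get_chat_template_by_model_path("some/base-model") == "<default template>"
```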
sglang/lang/tracer.py CHANGED
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
          with TracingScope(tracer):
              tracer.ret_value = program.func(tracer, **arguments)
      except (StopTracing, TypeError, AttributeError):
-         # Some exceptions may not be catched
+         # Some exceptions may not be caught
          pass

      # Run and cache prefix
sglang/srt/code_completion_parser.py CHANGED
@@ -27,7 +27,7 @@ completion_template_name = None


  class FimPosition:
-     """Postion of fim middle token."""
+     """Position of fim middle token."""

      MIDDLE = auto()
      END = auto()
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -48,6 +48,9 @@ class DictOutput(object):
      def __getitem__(self, item):
          return self.__dict__[item]

+     def __contains__(self, key):
+         return key in self.__dict__
+
      def __setitem__(self, key, value):
          self.__dict__[key] = value

@@ -413,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
          h = w = math.ceil(
              (self.image_size // self.patch_size) / self.downsample_ratio
          )
-         # global views tokens h * (w + 1), 1 is for line seperator
+         # global views tokens h * (w + 1), 1 is for line separator
          tokenized_image = [self.image_token_id] * h * (w + 1)
-         # add a seperator between global and local views
+         # add a separator between global and local views
          tokenized_image += [self.image_token_id]
          # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
          tokenized_image += (
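
With `__contains__` added, `DictOutput` now supports `in` checks alongside the dict-style item access it already had. A standalone sketch; only the three dunder methods come from the diff, and the `__init__` here is assumed for illustration:

```python
# Standalone sketch of DictOutput after the change; __init__ is illustrative.
class DictOutput(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __getitem__(self, item):
        return self.__dict__[item]

    def __contains__(self, key):  # new in this release
        return key in self.__dict__

    def __setitem__(self, key, value):
        self.__dict__[key] = value

out = DictOutput(input_ids=[1, 2, 3])
assert "input_ids" in out         # enabled by __contains__
assert "pixel_values" not in out
```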
sglang/srt/configs/device_config.py CHANGED
@@ -10,7 +10,7 @@ class DeviceConfig:
      device: Optional[torch.device]

      def __init__(self, device: str = "cuda") -> None:
-         if device in ["cuda", "xpu", "hpu", "cpu"]:
+         if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
              self.device_type = device
          else:
              raise RuntimeError(f"Not supported device type: {device}")
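
This pairs with the new `npu_communicator.py` in the file list: constructing a `DeviceConfig` for Ascend NPUs no longer raises. A small sketch, assuming the import path from the file list above:

```python
# Sketch: "npu" is now an accepted device type in 0.4.6.post4.
from sglang.srt.configs.device_config import DeviceConfig

cfg = DeviceConfig("npu")  # raised RuntimeError in 0.4.6.post2
print(cfg.device_type)     # -> "npu"
```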