sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry, and is provided for informational purposes only.
Files changed (176)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +164 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +62 -23
  49. sglang/srt/layers/elementwise.py +411 -0
  50. sglang/srt/layers/layernorm.py +24 -2
  51. sglang/srt/layers/linear.py +17 -5
  52. sglang/srt/layers/logits_processor.py +26 -7
  53. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  54. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  55. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  56. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  63. sglang/srt/layers/moe/router.py +342 -0
  64. sglang/srt/layers/moe/topk.py +31 -18
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +184 -126
  67. sglang/srt/layers/quantization/base_config.py +5 -0
  68. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  69. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  70. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  75. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  76. sglang/srt/layers/quantization/fp8.py +76 -34
  77. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  78. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  79. sglang/srt/layers/quantization/gptq.py +36 -9
  80. sglang/srt/layers/quantization/kv_cache.py +98 -0
  81. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  82. sglang/srt/layers/quantization/utils.py +153 -0
  83. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  84. sglang/srt/layers/rotary_embedding.py +66 -87
  85. sglang/srt/layers/sampler.py +1 -1
  86. sglang/srt/lora/layers.py +68 -0
  87. sglang/srt/lora/lora.py +2 -22
  88. sglang/srt/lora/lora_manager.py +47 -23
  89. sglang/srt/lora/mem_pool.py +110 -51
  90. sglang/srt/lora/utils.py +12 -1
  91. sglang/srt/managers/cache_controller.py +4 -5
  92. sglang/srt/managers/data_parallel_controller.py +31 -9
  93. sglang/srt/managers/expert_distribution.py +81 -0
  94. sglang/srt/managers/io_struct.py +39 -3
  95. sglang/srt/managers/mm_utils.py +373 -0
  96. sglang/srt/managers/multimodal_processor.py +68 -0
  97. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  98. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  99. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  100. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  101. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  102. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  103. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  104. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  105. sglang/srt/managers/schedule_batch.py +134 -31
  106. sglang/srt/managers/scheduler.py +325 -38
  107. sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
  108. sglang/srt/managers/session_controller.py +1 -1
  109. sglang/srt/managers/tokenizer_manager.py +59 -23
  110. sglang/srt/managers/tp_worker.py +1 -1
  111. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  112. sglang/srt/managers/utils.py +6 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +27 -8
  114. sglang/srt/mem_cache/memory_pool.py +258 -98
  115. sglang/srt/mem_cache/paged_allocator.py +2 -2
  116. sglang/srt/mem_cache/radix_cache.py +4 -4
  117. sglang/srt/model_executor/cuda_graph_runner.py +85 -28
  118. sglang/srt/model_executor/forward_batch_info.py +81 -15
  119. sglang/srt/model_executor/model_runner.py +70 -6
  120. sglang/srt/model_loader/loader.py +160 -2
  121. sglang/srt/model_loader/weight_utils.py +45 -0
  122. sglang/srt/models/deepseek_janus_pro.py +29 -86
  123. sglang/srt/models/deepseek_nextn.py +22 -10
  124. sglang/srt/models/deepseek_v2.py +326 -192
  125. sglang/srt/models/deepseek_vl2.py +358 -0
  126. sglang/srt/models/gemma3_causal.py +684 -0
  127. sglang/srt/models/gemma3_mm.py +462 -0
  128. sglang/srt/models/grok.py +374 -119
  129. sglang/srt/models/llama.py +47 -7
  130. sglang/srt/models/llama_eagle.py +1 -0
  131. sglang/srt/models/llama_eagle3.py +196 -0
  132. sglang/srt/models/llava.py +3 -3
  133. sglang/srt/models/llavavid.py +3 -3
  134. sglang/srt/models/minicpmo.py +1995 -0
  135. sglang/srt/models/minicpmv.py +62 -137
  136. sglang/srt/models/mllama.py +4 -4
  137. sglang/srt/models/phi3_small.py +1 -1
  138. sglang/srt/models/qwen2.py +3 -0
  139. sglang/srt/models/qwen2_5_vl.py +68 -146
  140. sglang/srt/models/qwen2_classification.py +75 -0
  141. sglang/srt/models/qwen2_moe.py +9 -1
  142. sglang/srt/models/qwen2_vl.py +25 -63
  143. sglang/srt/openai_api/adapter.py +145 -47
  144. sglang/srt/openai_api/protocol.py +23 -2
  145. sglang/srt/sampling/sampling_batch_info.py +1 -1
  146. sglang/srt/sampling/sampling_params.py +6 -6
  147. sglang/srt/server_args.py +104 -14
  148. sglang/srt/speculative/build_eagle_tree.py +7 -347
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  150. sglang/srt/speculative/eagle_utils.py +208 -252
  151. sglang/srt/speculative/eagle_worker.py +139 -53
  152. sglang/srt/speculative/spec_info.py +6 -1
  153. sglang/srt/torch_memory_saver_adapter.py +22 -0
  154. sglang/srt/utils.py +182 -21
  155. sglang/test/__init__.py +0 -0
  156. sglang/test/attention/__init__.py +0 -0
  157. sglang/test/attention/test_flashattn_backend.py +312 -0
  158. sglang/test/runners.py +2 -0
  159. sglang/test/test_activation.py +2 -1
  160. sglang/test/test_block_fp8.py +5 -4
  161. sglang/test/test_block_fp8_ep.py +2 -1
  162. sglang/test/test_dynamic_grad_mode.py +58 -0
  163. sglang/test/test_layernorm.py +3 -2
  164. sglang/test/test_utils.py +55 -4
  165. sglang/utils.py +31 -0
  166. sglang/version.py +1 -1
  167. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  168. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
  169. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  170. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  171. sglang/srt/managers/image_processor.py +0 -55
  172. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  173. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  174. sglang/srt/managers/multi_modality_padding.py +0 -134
  175. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  176. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -32,6 +32,7 @@ from sglang.lang.choices import (
  )
  from sglang.utils import LazyImport
 
+ ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
  Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
  LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
  OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
@@ -67,6 +68,7 @@ __all__ = [
  "greedy_token_selection",
  "token_length_normalized",
  "unconditional_likelihood_normalized",
+ "ServerArgs",
  "Anthropic",
  "LiteLLM",
  "OpenAI",
sglang/api.py CHANGED
@@ -75,6 +75,7 @@ def gen(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
  min_tokens: Optional[int] = None,
+ n: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
@@ -115,6 +116,7 @@ def gen(
  name,
  max_tokens,
  min_tokens,
+ n,
  stop,
  stop_token_ids,
  temperature,
@@ -137,6 +139,7 @@ def gen(
  def gen_int(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
+ n: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
@@ -155,6 +158,7 @@ def gen_int(
  name,
  max_tokens,
  None,
+ n,
  stop,
  stop_token_ids,
  temperature,
@@ -176,6 +180,7 @@ def gen_int(
  def gen_string(
  name: Optional[str] = None,
  max_tokens: Optional[int] = None,
+ n: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
@@ -194,6 +199,7 @@ def gen_string(
  name,
  max_tokens,
  None,
+ n,
  stop,
  stop_token_ids,
  temperature,
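
The hunks above thread a new `n` (number of samples) argument through `gen`, `gen_int`, and `gen_string`. A minimal usage sketch, assuming an OpenAI-style backend has been configured; the function, prompt, and model name below are illustrative, not part of the package:

import sglang as sgl

sgl.set_default_backend(sgl.OpenAI("gpt-4o-mini"))  # illustrative model name

@sgl.function
def brainstorm(s, topic):
    s += "Give one startup idea about " + topic + ": "
    # With n > 1 the backend may return several choices; the interpreter then
    # stores a list in the variable and appends only the first sample to the text.
    s += sgl.gen("idea", max_tokens=64, temperature=0.9, n=3)

state = brainstorm.run(topic="robotics")
print(state["idea"])  # a str for a single choice, a list of str for several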
sglang/bench_one_batch.py CHANGED
@@ -117,7 +117,7 @@ class BenchArgs:
 
  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
- # use the default value's type to case the args into correct types.
+ # use the default value's type to cast the args into correct types.
  attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
  return cls(
  **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
sglang/bench_one_batch_server.py CHANGED
@@ -57,7 +57,7 @@ class BenchArgs:
 
  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
- # use the default value's type to case the args into correct types.
+ # use the default value's type to cast the args into correct types.
  attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
  return cls(
  **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
sglang/bench_serving.py CHANGED
@@ -128,7 +128,7 @@ async def async_request_trt_llm(
  timestamp = time.perf_counter()
  # First token
  if ttft == 0.0:
- ttft = time.perf_counter() - st
+ ttft = timestamp - st
  output.ttft = ttft
 
  # Decoding phase
@@ -501,6 +501,7 @@ def get_dataset(args, tokenizer):
  question_len=args.gsp_question_len,
  output_len=args.gsp_output_len,
  tokenizer=tokenizer,
+ args=args,
  )
  else:
  raise ValueError(f"Unknown dataset: {args.dataset_name}")
@@ -788,6 +789,7 @@ def sample_generated_shared_prefix_requests(
  question_len: int,
  output_len: int,
  tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace,
  ) -> List[Tuple[str, int, int]]:
  """Generate benchmark requests with shared system prompts using random tokens and caching."""
  cache_path = get_gen_prefix_cache_path(args, tokenizer)
sglang/check_env.py CHANGED
@@ -1,6 +1,6 @@
  """Check environment configurations and dependency versions."""
 
- import importlib
+ import importlib.metadata
  import os
  import resource
  import subprocess
@@ -59,9 +59,8 @@ def get_package_versions(packages):
  for package in packages:
  package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
  try:
- module = importlib.import_module(package_name)
- if hasattr(module, "__version__"):
- versions[package_name] = module.__version__
+ version = importlib.metadata.version(package_name)
+ versions[package_name] = version
  except ModuleNotFoundError:
  versions[package_name] = "Module Not Found"
  return versions
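
For context on the check_env.py change: importlib.metadata.version reads the installed distribution's metadata instead of importing the module, and it raises PackageNotFoundError, a subclass of ModuleNotFoundError, so the existing except clause still catches missing packages. A small stand-alone sketch (the package name is only an example):

import importlib.metadata

try:
    print(importlib.metadata.version("numpy"))  # prints the installed version string
except importlib.metadata.PackageNotFoundError:
    print("Module Not Found")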
sglang/lang/backend/openai.py CHANGED
@@ -165,6 +165,7 @@ class OpenAI(BaseBackend):
  kwargs.pop("max_tokens", None)
  else:
  kwargs.pop("max_completion_tokens", None)
+
  comp = openai_completion(
  client=self.client,
  token_usage=self.token_usage,
@@ -173,13 +174,13 @@ class OpenAI(BaseBackend):
  prompt=prompt,
  **kwargs,
  )
+ # Keep the returned list (or string) as is.
  elif sampling_params.dtype in [str, "str", "string"]:
  assert (
  not self.is_chat_model
  ), "constrained type not supported on chat model"
  kwargs = sampling_params.to_openai_kwargs()
  kwargs.pop("stop")
-
  comp = openai_completion(
  client=self.client,
  token_usage=self.token_usage,
@@ -189,7 +190,11 @@ class OpenAI(BaseBackend):
  stop='"',
  **kwargs,
  )
- comp = '"' + comp + '"'
+ # Wrap each element in quotes if we have a list.
+ if isinstance(comp, list):
+ comp = ['"' + x + '"' for x in comp]
+ else:
+ comp = '"' + comp + '"'
  elif sampling_params.dtype in [int, "int"]:
  assert (
  not self.is_chat_model
@@ -206,6 +211,7 @@ class OpenAI(BaseBackend):
  stop=[" "],
  **kwargs,
  )
+ # Leave as a list if that's what is returned.
  else:
  raise ValueError(f"Unknown dtype: {sampling_params.dtype}")
 
@@ -254,7 +260,9 @@ class OpenAI(BaseBackend):
  prompt=s.messages_,
  **self.spec_kwargs,
  )
- if self.spec_pattern_match(comp):
+ # Use a string for pattern matching.
+ comp_for_match = comp[0] if isinstance(comp, list) else comp
+ if self.spec_pattern_match(comp_for_match):
  break
 
  for term in self.spec_format:
@@ -370,7 +378,7 @@ class OpenAI(BaseBackend):
 
  def openai_completion(
  client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
- ):
+ ) -> Union[str, List[str]]:
  # if "ebnf" is in kwargs, warn and remove
  if "ebnf" in kwargs:
  warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
@@ -382,13 +390,18 @@ def openai_completion(
  if "stop" in kwargs and kwargs["stop"] is None:
  kwargs.pop("stop")
  ret = client.chat.completions.create(messages=prompt, **kwargs)
- comp = ret.choices[0].message.content
+ if len(ret.choices) == 1:
+ comp = ret.choices[0].message.content
+ else:
+ comp = [c.message.content for c in ret.choices]
  else:
  ret = client.completions.create(prompt=prompt, **kwargs)
  if isinstance(prompt, (list, tuple)):
  comp = [c.text for c in ret.choices]
  else:
  comp = ret.choices[0].text
+ if len(ret.choices) > 1:
+ comp = [c.text for c in ret.choices]
 
  token_usage.prompt_tokens += ret.usage.prompt_tokens
  token_usage.completion_tokens += ret.usage.completion_tokens
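
With these changes openai_completion can return either a single string or a list of strings when the request asks for several choices. A hedged sketch of how a caller can normalize the result (names are illustrative); the quote-wrapping and pattern-matching branches above follow the same pattern:

# `comp` is whatever openai_completion returned: one string, or a list of
# strings when multiple choices came back (e.g. n > 1).
choices = comp if isinstance(comp, list) else [comp]
for text in choices:
    print(text)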
sglang/lang/chat_template.py CHANGED
@@ -15,6 +15,7 @@ class ChatTemplate:
  role_prefix_and_suffix: Dict[str, Tuple[str, str]]
  stop_str: List[str] = ()
  image_token: str = "<image>"
+ audio_token: str = "<audio>"
  style: ChatTemplateStyle = ChatTemplateStyle.PLAIN
 
  def get_prefix_and_suffix(
@@ -253,6 +254,22 @@ register_chat_template(
  )
  )
 
+ # https://huggingface.co/openbmb/MiniCPM-o-2_6
+ register_chat_template(
+ ChatTemplate(
+ name="minicpmo",
+ default_system_prompt=None,
+ role_prefix_and_suffix={
+ "system": ("", " "),
+ "user": ("user:", " "),
+ "assistant": ("assistant:", "</s>"),
+ },
+ stop_str=("<|im_end|>", "<|endoftext|>"),
+ image_token="(<image>./</image>)",
+ audio_token="(<audio>./</audio>)",
+ )
+ )
+
  # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
  register_chat_template(
  ChatTemplate(
@@ -474,12 +491,6 @@ def match_chat_ml(model_path: str):
  return get_chat_template("chatml-llava")
 
 
- @register_chat_template_matching_function
- def match_chat_minicpm(model_path: str):
- if "minicpm" in model_path:
- return get_chat_template("minicpmv")
-
-
  @register_chat_template_matching_function
  def match_chat_yi(model_path: str):
  model_path = model_path.lower()
@@ -499,8 +510,10 @@ def match_gemma_it(model_path: str):
  @register_chat_template_matching_function
  def match_openbmb_minicpm(model_path: str):
  model_path = model_path.lower()
- if "minicpm" in model_path:
+ if "minicpm-v" in model_path:
  return get_chat_template("minicpmv")
+ elif "minicpm-o" in model_path:
+ return get_chat_template("minicpmo")
 
 
  @register_chat_template_matching_function
@@ -520,6 +533,14 @@ def match_granite_instruct(model_path: str):
  return get_chat_template("granite-3-instruct")
 
 
+ @register_chat_template_matching_function
+ def match_gemma3_instruct(model_path: str):
+ model_path = model_path.lower()
+ if "gemma-3" in model_path and "1b" not in model_path:
+ # gemma-3-1b-it is completion model
+ return get_chat_template("gemma-it")
+
+
  if __name__ == "__main__":
  messages = [
  {"role": "system", "content": None}, # None means default
sglang/lang/interpreter.py CHANGED
@@ -566,13 +566,13 @@ class StreamExecutor:
  def _execute_gen(self, expr: SglGen):
  sampling_params = self._resolve_sampling_params(expr.sampling_params)
  name = expr.name
-
  if not self.stream:
  if self.num_api_spec_tokens is None:
  comp, meta_info = self.backend.generate(
  self,
  sampling_params=sampling_params,
  )
+
  else:
  if self.backend.is_chat_model:
  # Speculative execution on models with only chat interface.
@@ -587,8 +587,11 @@ class StreamExecutor:
 
  else: # Speculative execution on models with completion interface
  comp, meta_info = self._spec_gen(sampling_params)
-
- self.text_ += comp
+ if isinstance(comp, list):
+ self.text_ += comp[0]
+ else:
+ assert isinstance(comp, str)
+ self.text_ += comp
 
  self.variables[name] = comp
  self.meta_info[name] = meta_info
@@ -747,6 +750,7 @@ class StreamExecutor:
  for item in [
  "max_new_tokens",
  "min_new_tokens",
+ "n",
  "stop",
  "stop_token_ids",
  "temperature",
sglang/lang/ir.py CHANGED
@@ -18,6 +18,7 @@ REGEX_STR = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg
  class SglSamplingParams:
  max_new_tokens: int = 128
  min_new_tokens: int = 0
+ n: int = 1
  stop: Union[str, List[str]] = ()
  stop_token_ids: Optional[List[int]] = ()
  temperature: float = 1.0
@@ -41,6 +42,7 @@ class SglSamplingParams:
  return SglSamplingParams(
  self.max_new_tokens,
  self.min_new_tokens,
+ self.n,
  self.stop,
  self.stop_token_ids,
  self.temperature,
@@ -64,6 +66,7 @@ class SglSamplingParams:
  return {
  "max_tokens": self.max_new_tokens,
  "max_completion_tokens": self.max_new_tokens,
+ "n": self.n,
  "stop": self.stop or None,
  "temperature": self.temperature,
  "top_p": self.top_p,
@@ -117,6 +120,7 @@ class SglSamplingParams:
  return {
  "max_new_tokens": self.max_new_tokens,
  "min_new_tokens": self.min_new_tokens,
+ "n": self.n,
  "stop": self.stop,
  "stop_token_ids": self.stop_token_ids,
  "temperature": self.temperature,
@@ -154,6 +158,7 @@ class SglFunction:
  self,
  *args,
  max_new_tokens: int = 128,
+ n: int = 1,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: float = 1.0,
@@ -182,6 +187,7 @@ class SglFunction:
 
  default_sampling_para = SglSamplingParams(
  max_new_tokens=max_new_tokens,
+ n=n,
  stop=stop,
  stop_token_ids=stop_token_ids,
  temperature=temperature,
@@ -212,6 +218,7 @@ class SglFunction:
  batch_kwargs,
  *,
  max_new_tokens: int = 128,
+ n: int = 1,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: float = 1.0,
@@ -257,6 +264,7 @@ class SglFunction:
 
  default_sampling_para = SglSamplingParams(
  max_new_tokens=max_new_tokens,
+ n=n,
  stop=stop,
  stop_token_ids=stop_token_ids,
  temperature=temperature,
@@ -440,6 +448,7 @@ class SglGen(SglExpr):
  name: Optional[str] = None,
  max_new_tokens: Optional[int] = None,
  min_new_tokens: Optional[int] = None,
+ n: Optional[int] = None,
  stop: Optional[Union[str, List[str]]] = None,
  stop_token_ids: Optional[List[int]] = None,
  temperature: Optional[float] = None,
@@ -463,6 +472,7 @@ class SglGen(SglExpr):
  self.sampling_params = SglSamplingParams(
  max_new_tokens=max_new_tokens,
  min_new_tokens=min_new_tokens,
+ n=n,
  stop=stop,
  stop_token_ids=stop_token_ids,
  temperature=temperature,
sglang/srt/_custom_ops.py CHANGED
@@ -10,7 +10,7 @@ from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
  logger = logging.getLogger(__name__)
  use_vllm_custom_allreduce = get_bool_env_var(
- "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+ "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
  )
 
  if not is_hpu():
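
The default for USE_VLLM_CUSTOM_ALLREDUCE flips from "true" to "false", so the vLLM custom all-reduce is no longer used by default. A minimal sketch of opting back in, assuming the variable is set before the sglang.srt modules are imported (the flag is read at module import time):

import os

# Opt back in to vLLM's custom all-reduce kernels (the previous default).
os.environ["USE_VLLM_CUSTOM_ALLREDUCE"] = "true"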
sglang/srt/code_completion_parser.py ADDED
@@ -0,0 +1,174 @@
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ """Completion templates."""
+
+
+ import dataclasses
+ import json
+ import logging
+ import os
+ from enum import auto
+
+ from sglang.srt.openai_api.protocol import ChatCompletionRequest
+
+ logger = logging.getLogger(__name__)
+ completion_template_name = None
+
+
+ class FimPosition:
+ """Postion of fim middle token."""
+
+ MIDDLE = auto()
+ END = auto()
+
+
+ @dataclasses.dataclass
+ class CompletionTemplate:
+ """A class that manages completion prompt templates. only for code completion currently."""
+
+ # The name of this template
+ name: str
+
+ # the fim begin token
+ fim_begin_token: str
+
+ # The fim middle token
+ fim_middle_token: str
+
+ # The fim end token
+ fim_end_token: str
+
+ # The position of the fim middle token
+ fim_position: FimPosition
+
+
+ # A global registry for all completion templates
+ completion_templates: dict[str, CompletionTemplate] = {}
+
+
+ def load_completion_template_for_openai_api(completion_template_arg):
+ global completion_template_name
+
+ logger.info(
+ f"Use completion template for the OpenAI-compatible API server: {completion_template_arg}"
+ )
+
+ if not completion_template_exists(completion_template_arg):
+ if not os.path.exists(completion_template_arg):
+ raise RuntimeError(
+ f"Completion template {completion_template_arg} is not a built-in template name "
+ "or a valid completion template file path."
+ )
+
+ assert completion_template_arg.endswith(
+ ".json"
+ ), "unrecognized format of completion template file"
+ with open(completion_template_arg, "r") as filep:
+ template = json.load(filep)
+ try:
+ fim_position = FimPosition[template["fim_position"]]
+ except KeyError:
+ raise ValueError(
+ f"Unknown fim position: {template['fim_position']}"
+ ) from None
+ register_completion_template(
+ CompletionTemplate(
+ name=template["name"],
+ fim_begin_token=template["fim_begin_token"],
+ fim_middle_token=template["fim_middle_token"],
+ fim_end_token=template["fim_end_token"],
+ fim_position=fim_position,
+ ),
+ override=True,
+ )
+ completion_template_name = template["name"]
+ else:
+ completion_template_name = completion_template_arg
+
+
+ def register_completion_template(template: CompletionTemplate, override: bool = False):
+ """Register a new completion template."""
+ if not override:
+ assert (
+ template.name not in completion_templates
+ ), f"{template.name} has been registered."
+
+ completion_templates[template.name] = template
+
+
+ def completion_template_exists(template_name: str) -> bool:
+ return template_name in completion_templates
+
+
+ def is_completion_template_defined() -> bool:
+ global completion_template_name
+ return completion_template_name != None
+
+
+ def generate_completion_prompt_from_request(request: ChatCompletionRequest) -> str:
+ global completion_template_name
+ if request.suffix == "":
+ return request.prompt
+
+ return generate_completion_prompt(
+ request.prompt, request.suffix, completion_template_name
+ )
+
+
+ def generate_completion_prompt(prompt: str, suffix: str, template_name: str) -> str:
+
+ completion_template = completion_templates[template_name]
+ fim_begin_token = completion_template.fim_begin_token
+ fim_middle_token = completion_template.fim_middle_token
+ fim_end_token = completion_template.fim_end_token
+ fim_position = completion_template.fim_position
+
+ if fim_position == FimPosition.MIDDLE:
+ prompt = f"{fim_begin_token}{prompt}{fim_middle_token}{suffix}{fim_end_token}"
+ elif fim_position == FimPosition.END:
+ prompt = f"{fim_begin_token}{prompt}{fim_end_token}{suffix}{fim_middle_token}"
+
+ return prompt
+
+
+ register_completion_template(
+ CompletionTemplate(
+ name="deepseek_coder",
+ fim_begin_token="<|fim▁begin|>",
+ fim_middle_token="<|fim▁hole|>",
+ fim_end_token="<|fim▁end|>",
+ fim_position=FimPosition.MIDDLE,
+ )
+ )
+
+
+ register_completion_template(
+ CompletionTemplate(
+ name="star_coder",
+ fim_begin_token="<fim_prefix>",
+ fim_middle_token="<fim_middle>",
+ fim_end_token="<fim_suffix>",
+ fim_position=FimPosition.END,
+ )
+ )
+
+ register_completion_template(
+ CompletionTemplate(
+ name="qwen_coder",
+ fim_begin_token="<|fim_prefix|>",
+ fim_middle_token="<|fim_middle|>",
+ fim_end_token="<|fim_suffix|>",
+ fim_position=FimPosition.END,
+ )
+ )
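
Given the templates registered above, the two FimPosition layouts order the prefix and suffix differently around the fill-in-the-middle tokens. A small worked example calling the helper added in this file (the code fragment being completed is illustrative):

from sglang.srt.code_completion_parser import generate_completion_prompt

prefix = "def add(a, b):\n    return "
suffix = "\n\nprint(add(1, 2))"

# FimPosition.MIDDLE (deepseek_coder): begin + prompt + middle + suffix + end
print(generate_completion_prompt(prefix, suffix, "deepseek_coder"))

# FimPosition.END (star_coder): begin + prompt + end + suffix + middle
print(generate_completion_prompt(prefix, suffix, "star_coder"))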
sglang/srt/configs/__init__.py CHANGED
@@ -1,17 +1,13 @@
  from sglang.srt.configs.chatglm import ChatGLMConfig
  from sglang.srt.configs.dbrx import DbrxConfig
+ from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
  from sglang.srt.configs.exaone import ExaoneConfig
  from sglang.srt.configs.janus_pro import MultiModalityConfig
- from sglang.srt.configs.qwen2_5_vl_config import (
- Qwen2_5_VLConfig,
- Qwen2_5_VLVisionConfig,
- )
 
  __all__ = [
  "ExaoneConfig",
  "ChatGLMConfig",
  "DbrxConfig",
- "Qwen2_5_VLConfig",
- "Qwen2_5_VLVisionConfig",
+ "DeepseekVL2Config",
  "MultiModalityConfig",
  ]