sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +3 -1
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +667 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +63 -11
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/parallel_state.py +10 -3
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +71 -12
- sglang/srt/function_call_parser.py +164 -54
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +295 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +62 -23
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +26 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/moe/topk.py +31 -18
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +184 -126
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +24 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -9
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +66 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/layers.py +68 -0
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +47 -23
- sglang/srt/lora/mem_pool.py +110 -51
- sglang/srt/lora/utils.py +12 -1
- sglang/srt/managers/cache_controller.py +4 -5
- sglang/srt/managers/data_parallel_controller.py +31 -9
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +39 -3
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -31
- sglang/srt/managers/scheduler.py +325 -38
- sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -23
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +27 -8
- sglang/srt/mem_cache/memory_pool.py +258 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +85 -28
- sglang/srt/model_executor/forward_batch_info.py +81 -15
- sglang/srt/model_executor/model_runner.py +70 -6
- sglang/srt/model_loader/loader.py +160 -2
- sglang/srt/model_loader/weight_utils.py +45 -0
- sglang/srt/models/deepseek_janus_pro.py +29 -86
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +326 -192
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +684 -0
- sglang/srt/models/gemma3_mm.py +462 -0
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +145 -47
- sglang/srt/openai_api/protocol.py +23 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +104 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +139 -53
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +182 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +2 -0
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +55 -4
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED

@@ -32,6 +32,7 @@ from sglang.lang.choices import (
 )
 from sglang.utils import LazyImport

+ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
 LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
@@ -67,6 +68,7 @@ __all__ = [
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
+    "ServerArgs",
     "Anthropic",
     "LiteLLM",
     "OpenAI",
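This change re-exports ServerArgs at the package root via LazyImport, so the heavy sglang.srt modules are only imported on first use. A minimal usage sketch (the model path here is a placeholder):

```python
import sglang

# ServerArgs is resolved lazily on first attribute access,
# so `import sglang` stays cheap.
args = sglang.ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")
print(args.model_path)
```
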
sglang/api.py CHANGED

@@ -75,6 +75,7 @@ def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     min_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -115,6 +116,7 @@ def gen(
         name,
         max_tokens,
         min_tokens,
+        n,
         stop,
         stop_token_ids,
         temperature,
@@ -137,6 +139,7 @@ def gen(
 def gen_int(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -155,6 +158,7 @@ def gen_int(
         name,
         max_tokens,
         None,
+        n,
         stop,
         stop_token_ids,
         temperature,
@@ -176,6 +180,7 @@ def gen_int(
 def gen_string(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -194,6 +199,7 @@ def gen_string(
         name,
         max_tokens,
         None,
+        n,
         stop,
         stop_token_ids,
         temperature,
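These hunks thread a new `n` parameter (number of samples per call) through gen, gen_int, and gen_string into the sampling parameters. A hedged usage sketch (backend setup omitted; with n > 1 the stored value may be a list of strings, as the openai backend changes below show):

```python
import sglang as sgl

@sgl.function
def brainstorm(s, topic):
    s += "Give one idea about " + topic + ".\n"
    # Ask the backend for two independent samples of this generation.
    s += sgl.gen("idea", max_tokens=48, n=2)
```
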
sglang/bench_one_batch.py CHANGED

@@ -117,7 +117,7 @@ class BenchArgs:

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        # use the default value's type to
+        # use the default value's type to cast the args into correct types.
         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
         return cls(
             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
sglang/bench_one_batch_server.py CHANGED

@@ -57,7 +57,7 @@ class BenchArgs:

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        # use the default value's type to
+        # use the default value's type to cast the args into correct types.
         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
         return cls(
             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
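Both benchmark scripts complete the same truncated comment: from_cli_args uses each dataclass field's default value to recover the field's type and cast the parsed argparse value. A self-contained illustration of the pattern (the class and flags here are illustrative, not from the package):

```python
import argparse
import dataclasses

@dataclasses.dataclass
class DemoArgs:
    batch_size: int = 1
    run_name: str = "default"

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # Use the default value's type to cast the args into correct types.
        attrs = [(f.name, type(f.default)) for f in dataclasses.fields(cls)]
        return cls(**{name: typ(getattr(args, name)) for name, typ in attrs})

parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", dest="batch_size", default="8")  # argparse yields a str
parser.add_argument("--run-name", dest="run_name", default="exp1")
demo = DemoArgs.from_cli_args(parser.parse_args([]))
assert demo.batch_size == 8  # "8" was cast to int via the field's default type
```
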
sglang/bench_serving.py CHANGED

@@ -128,7 +128,7 @@ async def async_request_trt_llm(
                     timestamp = time.perf_counter()
                     # First token
                     if ttft == 0.0:
-                        ttft = time.perf_counter() - st
+                        ttft = timestamp - st
                         output.ttft = ttft

                     # Decoding phase
@@ -501,6 +501,7 @@ def get_dataset(args, tokenizer):
             question_len=args.gsp_question_len,
             output_len=args.gsp_output_len,
             tokenizer=tokenizer,
+            args=args,
         )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
@@ -788,6 +789,7 @@ def sample_generated_shared_prefix_requests(
     question_len: int,
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
+    args: argparse.Namespace,
 ) -> List[Tuple[str, int, int]]:
     """Generate benchmark requests with shared system prompts using random tokens and caching."""
     cache_path = get_gen_prefix_cache_path(args, tokenizer)
sglang/check_env.py CHANGED

@@ -1,6 +1,6 @@
 """Check environment configurations and dependency versions."""

-import importlib
+import importlib.metadata
 import os
 import resource
 import subprocess
@@ -59,9 +59,8 @@ def get_package_versions(packages):
     for package in packages:
         package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
         try:
-            module = importlib.import_module(package_name)
-            if hasattr(module, "__version__"):
-                versions[package_name] = module.__version__
+            version = importlib.metadata.version(package_name)
+            versions[package_name] = version
         except ModuleNotFoundError:
             versions[package_name] = "Module Not Found"
     return versions
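Using importlib.metadata reads the version from the installed distribution's metadata instead of importing the module and probing `__version__`, so packages without that attribute (or with slow imports) are handled correctly. For example:

```python
import importlib.metadata

# Looks up the installed distribution's metadata; the package
# itself is never imported or initialized.
print(importlib.metadata.version("pip"))
```
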
sglang/lang/backend/openai.py CHANGED

@@ -165,6 +165,7 @@ class OpenAI(BaseBackend):
                 kwargs.pop("max_tokens", None)
             else:
                 kwargs.pop("max_completion_tokens", None)
+
             comp = openai_completion(
                 client=self.client,
                 token_usage=self.token_usage,
@@ -173,13 +174,13 @@ class OpenAI(BaseBackend):
                 prompt=prompt,
                 **kwargs,
             )
+            # Keep the returned list (or string) as is.
         elif sampling_params.dtype in [str, "str", "string"]:
             assert (
                 not self.is_chat_model
             ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
-
             comp = openai_completion(
                 client=self.client,
                 token_usage=self.token_usage,
@@ -189,7 +190,11 @@ class OpenAI(BaseBackend):
                 stop='"',
                 **kwargs,
             )
-            comp = '"' + comp + '"'
+            # Wrap each element in quotes if we have a list.
+            if isinstance(comp, list):
+                comp = ['"' + x + '"' for x in comp]
+            else:
+                comp = '"' + comp + '"'
         elif sampling_params.dtype in [int, "int"]:
             assert (
                 not self.is_chat_model
@@ -206,6 +211,7 @@ class OpenAI(BaseBackend):
                 stop=[" "],
                 **kwargs,
             )
+            # Leave as a list if that's what is returned.
         else:
             raise ValueError(f"Unknown dtype: {sampling_params.dtype}")

@@ -254,7 +260,9 @@ class OpenAI(BaseBackend):
                 prompt=s.messages_,
                 **self.spec_kwargs,
             )
-            if self.spec_pattern_match(comp):
+            # Use a string for pattern matching.
+            comp_for_match = comp[0] if isinstance(comp, list) else comp
+            if self.spec_pattern_match(comp_for_match):
                 break

         for term in self.spec_format:
@@ -370,7 +378,7 @@ class OpenAI(BaseBackend):

 def openai_completion(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
-):
+) -> Union[str, List[str]]:
     # if "ebnf" is in kwargs, warn and remove
     if "ebnf" in kwargs:
         warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
@@ -382,13 +390,18 @@ def openai_completion(
             if "stop" in kwargs and kwargs["stop"] is None:
                 kwargs.pop("stop")
             ret = client.chat.completions.create(messages=prompt, **kwargs)
-            comp = ret.choices[0].message.content
+            if len(ret.choices) == 1:
+                comp = ret.choices[0].message.content
+            else:
+                comp = [c.message.content for c in ret.choices]
         else:
             ret = client.completions.create(prompt=prompt, **kwargs)
             if isinstance(prompt, (list, tuple)):
                 comp = [c.text for c in ret.choices]
             else:
                 comp = ret.choices[0].text
+            if len(ret.choices) > 1:
+                comp = [c.text for c in ret.choices]

         token_usage.prompt_tokens += ret.usage.prompt_tokens
         token_usage.completion_tokens += ret.usage.completion_tokens
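With these changes, openai_completion returns a single string for one choice and a list of strings when n > 1 produces several. Callers written before the `n` parameter can normalize with a helper along these lines (illustrative, not from the package):

```python
from typing import List, Union

def as_list(comp: Union[str, List[str]]) -> List[str]:
    # Normalize the str-or-list return value of openai_completion.
    return comp if isinstance(comp, list) else [comp]
```
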
sglang/lang/chat_template.py CHANGED

@@ -15,6 +15,7 @@ class ChatTemplate:
     role_prefix_and_suffix: Dict[str, Tuple[str, str]]
     stop_str: List[str] = ()
     image_token: str = "<image>"
+    audio_token: str = "<audio>"
     style: ChatTemplateStyle = ChatTemplateStyle.PLAIN

     def get_prefix_and_suffix(
@@ -253,6 +254,22 @@ register_chat_template(
     )
 )

+# https://huggingface.co/openbmb/MiniCPM-o-2_6
+register_chat_template(
+    ChatTemplate(
+        name="minicpmo",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("user:", " "),
+            "assistant": ("assistant:", "</s>"),
+        },
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        audio_token="(<audio>./</audio>)",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -474,12 +491,6 @@ def match_chat_ml(model_path: str):
     return get_chat_template("chatml-llava")


-@register_chat_template_matching_function
-def match_chat_minicpm(model_path: str):
-    if "minicpm" in model_path:
-        return get_chat_template("minicpmv")
-
-
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
@@ -499,8 +510,10 @@ def match_gemma_it(model_path: str):
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
     model_path = model_path.lower()
-    if "minicpm" in model_path:
+    if "minicpm-v" in model_path:
         return get_chat_template("minicpmv")
+    elif "minicpm-o" in model_path:
+        return get_chat_template("minicpmo")


 @register_chat_template_matching_function
@@ -520,6 +533,14 @@ def match_granite_instruct(model_path: str):
     return get_chat_template("granite-3-instruct")


+@register_chat_template_matching_function
+def match_gemma3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "gemma-3" in model_path and "1b" not in model_path:
+        # gemma-3-1b-it is completion model
+        return get_chat_template("gemma-it")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
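The MiniCPM matching is now split by family: paths containing "minicpm-v" keep the minicpmv template, while "minicpm-o" paths get the new minicpmo template with its audio_token. A quick check, assuming the module's existing get_chat_template_by_model_path helper:

```python
from sglang.lang.chat_template import get_chat_template_by_model_path

tmpl = get_chat_template_by_model_path("openbmb/MiniCPM-o-2_6")
print(tmpl.name)         # "minicpmo", via match_openbmb_minicpm
print(tmpl.audio_token)  # "(<audio>./</audio>)"
```
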
sglang/lang/interpreter.py CHANGED

@@ -566,13 +566,13 @@ class StreamExecutor:
     def _execute_gen(self, expr: SglGen):
         sampling_params = self._resolve_sampling_params(expr.sampling_params)
         name = expr.name
-
         if not self.stream:
             if self.num_api_spec_tokens is None:
                 comp, meta_info = self.backend.generate(
                     self,
                     sampling_params=sampling_params,
                 )
+
             else:
                 if self.backend.is_chat_model:
                     # Speculative execution on models with only chat interface.
@@ -587,8 +587,11 @@ class StreamExecutor:

                 else:  # Speculative execution on models with completion interface
                     comp, meta_info = self._spec_gen(sampling_params)
-
-            self.text_ += comp
+            if isinstance(comp, list):
+                self.text_ += comp[0]
+            else:
+                assert isinstance(comp, str)
+                self.text_ += comp

             self.variables[name] = comp
             self.meta_info[name] = meta_info
@@ -747,6 +750,7 @@ class StreamExecutor:
         for item in [
             "max_new_tokens",
             "min_new_tokens",
+            "n",
             "stop",
             "stop_token_ids",
             "temperature",
sglang/lang/ir.py CHANGED

@@ -18,6 +18,7 @@ REGEX_STR = r"\"[\w\d\s]*\""  # bugs with regex r"\".*\"" in interegular pkg
 class SglSamplingParams:
     max_new_tokens: int = 128
     min_new_tokens: int = 0
+    n: int = 1
     stop: Union[str, List[str]] = ()
     stop_token_ids: Optional[List[int]] = ()
     temperature: float = 1.0
@@ -41,6 +42,7 @@ class SglSamplingParams:
         return SglSamplingParams(
             self.max_new_tokens,
             self.min_new_tokens,
+            self.n,
             self.stop,
             self.stop_token_ids,
             self.temperature,
@@ -64,6 +66,7 @@ class SglSamplingParams:
         return {
             "max_tokens": self.max_new_tokens,
             "max_completion_tokens": self.max_new_tokens,
+            "n": self.n,
             "stop": self.stop or None,
             "temperature": self.temperature,
             "top_p": self.top_p,
@@ -117,6 +120,7 @@ class SglSamplingParams:
         return {
             "max_new_tokens": self.max_new_tokens,
             "min_new_tokens": self.min_new_tokens,
+            "n": self.n,
             "stop": self.stop,
             "stop_token_ids": self.stop_token_ids,
             "temperature": self.temperature,
@@ -154,6 +158,7 @@ class SglFunction:
         self,
         *args,
         max_new_tokens: int = 128,
+        n: int = 1,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -182,6 +187,7 @@ class SglFunction:

         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
+            n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
             temperature=temperature,
@@ -212,6 +218,7 @@ class SglFunction:
         batch_kwargs,
         *,
         max_new_tokens: int = 128,
+        n: int = 1,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -257,6 +264,7 @@ class SglFunction:

         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
+            n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
             temperature=temperature,
@@ -440,6 +448,7 @@ class SglGen(SglExpr):
         name: Optional[str] = None,
         max_new_tokens: Optional[int] = None,
         min_new_tokens: Optional[int] = None,
+        n: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: Optional[float] = None,
@@ -463,6 +472,7 @@ class SglGen(SglExpr):
         self.sampling_params = SglSamplingParams(
             max_new_tokens=max_new_tokens,
             min_new_tokens=min_new_tokens,
+            n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
             temperature=temperature,
sglang/srt/_custom_ops.py CHANGED

@@ -10,7 +10,7 @@ from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu

 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = get_bool_env_var(
-    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
 )

 if not is_hpu():
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
14
|
+
"""Completion templates."""
|
15
|
+
|
16
|
+
|
17
|
+
import dataclasses
|
18
|
+
import json
|
19
|
+
import logging
|
20
|
+
import os
|
21
|
+
from enum import auto
|
22
|
+
|
23
|
+
from sglang.srt.openai_api.protocol import ChatCompletionRequest
|
24
|
+
|
25
|
+
logger = logging.getLogger(__name__)
|
26
|
+
completion_template_name = None
|
27
|
+
|
28
|
+
|
29
|
+
class FimPosition:
|
30
|
+
"""Postion of fim middle token."""
|
31
|
+
|
32
|
+
MIDDLE = auto()
|
33
|
+
END = auto()
|
34
|
+
|
35
|
+
|
36
|
+
@dataclasses.dataclass
|
37
|
+
class CompletionTemplate:
|
38
|
+
"""A class that manages completion prompt templates. only for code completion currently."""
|
39
|
+
|
40
|
+
# The name of this template
|
41
|
+
name: str
|
42
|
+
|
43
|
+
# the fim begin token
|
44
|
+
fim_begin_token: str
|
45
|
+
|
46
|
+
# The fim middle token
|
47
|
+
fim_middle_token: str
|
48
|
+
|
49
|
+
# The fim end token
|
50
|
+
fim_end_token: str
|
51
|
+
|
52
|
+
# The position of the fim middle token
|
53
|
+
fim_position: FimPosition
|
54
|
+
|
55
|
+
|
56
|
+
# A global registry for all completion templates
|
57
|
+
completion_templates: dict[str, CompletionTemplate] = {}
|
58
|
+
|
59
|
+
|
60
|
+
def load_completion_template_for_openai_api(completion_template_arg):
|
61
|
+
global completion_template_name
|
62
|
+
|
63
|
+
logger.info(
|
64
|
+
f"Use completion template for the OpenAI-compatible API server: {completion_template_arg}"
|
65
|
+
)
|
66
|
+
|
67
|
+
if not completion_template_exists(completion_template_arg):
|
68
|
+
if not os.path.exists(completion_template_arg):
|
69
|
+
raise RuntimeError(
|
70
|
+
f"Completion template {completion_template_arg} is not a built-in template name "
|
71
|
+
"or a valid completion template file path."
|
72
|
+
)
|
73
|
+
|
74
|
+
assert completion_template_arg.endswith(
|
75
|
+
".json"
|
76
|
+
), "unrecognized format of completion template file"
|
77
|
+
with open(completion_template_arg, "r") as filep:
|
78
|
+
template = json.load(filep)
|
79
|
+
try:
|
80
|
+
fim_position = FimPosition[template["fim_position"]]
|
81
|
+
except KeyError:
|
82
|
+
raise ValueError(
|
83
|
+
f"Unknown fim position: {template['fim_position']}"
|
84
|
+
) from None
|
85
|
+
register_completion_template(
|
86
|
+
CompletionTemplate(
|
87
|
+
name=template["name"],
|
88
|
+
fim_begin_token=template["fim_begin_token"],
|
89
|
+
fim_middle_token=template["fim_middle_token"],
|
90
|
+
fim_end_token=template["fim_end_token"],
|
91
|
+
fim_position=fim_position,
|
92
|
+
),
|
93
|
+
override=True,
|
94
|
+
)
|
95
|
+
completion_template_name = template["name"]
|
96
|
+
else:
|
97
|
+
completion_template_name = completion_template_arg
|
98
|
+
|
99
|
+
|
100
|
+
def register_completion_template(template: CompletionTemplate, override: bool = False):
|
101
|
+
"""Register a new completion template."""
|
102
|
+
if not override:
|
103
|
+
assert (
|
104
|
+
template.name not in completion_templates
|
105
|
+
), f"{template.name} has been registered."
|
106
|
+
|
107
|
+
completion_templates[template.name] = template
|
108
|
+
|
109
|
+
|
110
|
+
def completion_template_exists(template_name: str) -> bool:
|
111
|
+
return template_name in completion_templates
|
112
|
+
|
113
|
+
|
114
|
+
def is_completion_template_defined() -> bool:
|
115
|
+
global completion_template_name
|
116
|
+
return completion_template_name != None
|
117
|
+
|
118
|
+
|
119
|
+
def generate_completion_prompt_from_request(request: ChatCompletionRequest) -> str:
|
120
|
+
global completion_template_name
|
121
|
+
if request.suffix == "":
|
122
|
+
return request.prompt
|
123
|
+
|
124
|
+
return generate_completion_prompt(
|
125
|
+
request.prompt, request.suffix, completion_template_name
|
126
|
+
)
|
127
|
+
|
128
|
+
|
129
|
+
def generate_completion_prompt(prompt: str, suffix: str, template_name: str) -> str:
|
130
|
+
|
131
|
+
completion_template = completion_templates[template_name]
|
132
|
+
fim_begin_token = completion_template.fim_begin_token
|
133
|
+
fim_middle_token = completion_template.fim_middle_token
|
134
|
+
fim_end_token = completion_template.fim_end_token
|
135
|
+
fim_position = completion_template.fim_position
|
136
|
+
|
137
|
+
if fim_position == FimPosition.MIDDLE:
|
138
|
+
prompt = f"{fim_begin_token}{prompt}{fim_middle_token}{suffix}{fim_end_token}"
|
139
|
+
elif fim_position == FimPosition.END:
|
140
|
+
prompt = f"{fim_begin_token}{prompt}{fim_end_token}{suffix}{fim_middle_token}"
|
141
|
+
|
142
|
+
return prompt
|
143
|
+
|
144
|
+
|
145
|
+
register_completion_template(
|
146
|
+
CompletionTemplate(
|
147
|
+
name="deepseek_coder",
|
148
|
+
fim_begin_token="<|fim▁begin|>",
|
149
|
+
fim_middle_token="<|fim▁hole|>",
|
150
|
+
fim_end_token="<|fim▁end|>",
|
151
|
+
fim_position=FimPosition.MIDDLE,
|
152
|
+
)
|
153
|
+
)
|
154
|
+
|
155
|
+
|
156
|
+
register_completion_template(
|
157
|
+
CompletionTemplate(
|
158
|
+
name="star_coder",
|
159
|
+
fim_begin_token="<fim_prefix>",
|
160
|
+
fim_middle_token="<fim_middle>",
|
161
|
+
fim_end_token="<fim_suffix>",
|
162
|
+
fim_position=FimPosition.END,
|
163
|
+
)
|
164
|
+
)
|
165
|
+
|
166
|
+
register_completion_template(
|
167
|
+
CompletionTemplate(
|
168
|
+
name="qwen_coder",
|
169
|
+
fim_begin_token="<|fim_prefix|>",
|
170
|
+
fim_middle_token="<|fim_middle|>",
|
171
|
+
fim_end_token="<|fim_suffix|>",
|
172
|
+
fim_position=FimPosition.END,
|
173
|
+
)
|
174
|
+
)
|
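The new module builds fill-in-the-middle (FIM) prompts for code completion. For a template with fim_position == END, such as qwen_coder, the prompt and suffix are joined as prefix/suffix/middle. A usage sketch based on the functions above:

```python
from sglang.srt.code_completion_parser import generate_completion_prompt

body = generate_completion_prompt(
    "def add(a, b):\n    return ",  # text before the cursor
    "\n\nprint(add(1, 2))",         # text after the cursor
    "qwen_coder",
)
# body == "<|fim_prefix|>def add(a, b):\n    return <|fim_suffix|>"
#         "\n\nprint(add(1, 2))<|fim_middle|>"
```
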
sglang/srt/configs/__init__.py CHANGED

@@ -1,17 +1,13 @@
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
 from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
-from sglang.srt.configs.qwen2_5_vl_config import (
-    Qwen2_5_VLConfig,
-    Qwen2_5_VLVisionConfig,
-)

 __all__ = [
     "ExaoneConfig",
     "ChatGLMConfig",
     "DbrxConfig",
-    "Qwen2_5_VLConfig",
-    "Qwen2_5_VLVisionConfig",
+    "DeepseekVL2Config",
     "MultiModalityConfig",
 ]
|