sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +302 -414
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +13 -8
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +144 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +773 -334
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +225 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +68 -37
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +102 -36
- sglang/srt/model_executor/cuda_graph_runner.py +56 -31
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +280 -81
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +135 -60
- sglang/srt/speculative/build_eagle_tree.py +8 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
- sglang/srt/speculative/eagle_utils.py +92 -57
- sglang/srt/speculative/eagle_worker.py +238 -111
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/models/xverse_moe.py
CHANGED
@@ -43,6 +43,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
 
 
 class XverseMLP(nn.Module):
@@ -54,10 +55,15 @@ class XverseMLP(nn.Module):
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
@@ -65,6 +71,7 @@ class XverseMLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -86,6 +93,7 @@ class XverseMoE(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.config = config
@@ -107,14 +115,19 @@ class XverseMoE(nn.Module):
                     hidden_act=config.hidden_act,
                     quant_config=quant_config,
                     reduce_results=False,
+                    prefix=add_prefix(f"experts.{i}", prefix),
                 )
-                for
+                for i in range(self.n_routed_experts)
             ]
         )
         self.pack_params()
 
         self.router = ReplicatedLinear(
-            config.hidden_size,
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("router", prefix),
         )
 
         if config.num_shared_experts is not None:
@@ -125,6 +138,7 @@ class XverseMoE(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
             )
 
     def pack_params(self):
@@ -182,6 +196,7 @@ class XverseAttention(nn.Module):
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -213,6 +228,7 @@ class XverseAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
         )
 
         self.o_proj = RowParallelLinear(
@@ -220,6 +236,7 @@ class XverseAttention(nn.Module):
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
         )
 
         self.rotary_emb = get_rope(
@@ -235,6 +252,7 @@ class XverseAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
         )
 
     def forward(
@@ -258,6 +276,7 @@ class XverseDecoderLayer(nn.Module):
         config: PretrainedConfig,
         layer_id: int,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -276,15 +295,21 @@ class XverseDecoderLayer(nn.Module):
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
         )
         if config.num_experts is not None:
-            self.mlp = XverseMoE(
+            self.mlp = XverseMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
         else:
             self.mlp = XverseMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
             )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -324,6 +349,7 @@ class XverseModel(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.padding_idx = config.pad_token_id
@@ -332,10 +358,16 @@ class XverseModel(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
         )
         self.layers = nn.ModuleList(
             [
-                XverseDecoderLayer(
+                XverseDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
@@ -364,13 +396,19 @@ class XverseMoeForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = XverseModel(
+        self.model = XverseModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
         self.lm_head = ParallelLMHead(
-            config.vocab_size,
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
        )
         self.logits_processor = LogitsProcessor(config)
 
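The change repeated throughout xverse_moe.py (and the other model files listed above) is threading a `prefix` argument into every submodule so that each layer can be addressed by its full dotted name during weight loading and quantization. The diff only uses add_prefix from sglang.srt.utils and does not show its definition, so the snippet below is a minimal sketch of the presumed behavior (dot-joining a child name onto an optional parent prefix), written for illustration only.

# Hypothetical sketch of the add_prefix helper used in the model diffs above.
# Assumption: it dot-joins a child name onto a non-empty parent prefix.
def add_prefix(name: str, prefix: str) -> str:
    return name if not prefix else f"{prefix}.{name}"

# The prefix= arguments above would then produce dotted module paths such as:
print(add_prefix("gate_up_proj", "model.layers.0.mlp"))  # -> model.layers.0.mlp.gate_up_proj
print(add_prefix("model", ""))                           # -> model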
sglang/srt/models/yivl.py
CHANGED
@@ -29,8 +29,9 @@ class YiVLForCausalLM(LlavaLlamaForCausalLM):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
-        super().__init__(config, quant_config)
+        super().__init__(config, quant_config, prefix=prefix)
 
         self.multi_modal_projector = YiVLMultiModalProjector(self.config)
         self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
sglang/srt/openai_api/adapter.py
CHANGED
@@ -26,8 +26,6 @@ from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from pydantic import ValidationError
 
-from sglang.lang.chat_template import get_chat_template_by_model_path
-
 try:
     from outlines.fsm.json_schema import convert_json_schema_to_str
 except ImportError:
@@ -74,6 +72,7 @@ from sglang.srt.openai_api.protocol import (
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import get_exception_traceback
 
 logger = logging.getLogger(__name__)
@@ -165,24 +164,19 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, mode
     else:
         chat_template_name = chat_template_arg
 
-    #
-
-
-
-    used_chat_template = chat_template_name
-    if official_chat_template != used_chat_template:
-        logger.warning(
-            f"Using a chat_template: '{used_chat_template}', "
-            f"which is different from official chat template: '{official_chat_template}', "
-            f"This discrepancy may lead to performance degradation."
-        )
+    # Check chat-template
+    # TODO:
+    # 1. Do not import any code from sglang.lang
+    # 2. For VLM, when chat_template_arg is None, set it automatically by guessing from model_path.
 
 
-async def v1_files_create(
+async def v1_files_create(
+    file: UploadFile, purpose: str, file_storage_path: str = None
+):
     try:
         global storage_dir
-        if
-            storage_dir =
+        if file_storage_path:
+            storage_dir = file_storage_path
         # Read the file content
         file_content = await file.read()
 
@@ -941,7 +935,13 @@ def v1_chat_generate_request(
         )
 
         if assistant_prefix:
-
+            encoded = tokenizer_manager.tokenizer.encode(assistant_prefix)
+            if (
+                encoded
+                and encoded[0] == tokenizer_manager.tokenizer.bos_token_id
+            ):
+                encoded = encoded[1:]
+            prompt_ids += encoded
         stop = request.stop
         image_data = None
         modalities = []
@@ -988,10 +988,17 @@ def v1_chat_generate_request(
             "ignore_eos": request.ignore_eos,
             "skip_special_tokens": request.skip_special_tokens,
         }
+
        if request.response_format and request.response_format.type == "json_schema":
            sampling_params["json_schema"] = convert_json_schema_to_str(
                request.response_format.json_schema.schema_
            )
+        elif (
+            request.response_format and request.response_format.type == "structural_tag"
+        ):
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                request.response_format.model_dump(by_alias=True)
+            )
        sampling_params_list.append(sampling_params)
 
        image_data_list.append(image_data)
@@ -1032,7 +1039,12 @@ def v1_chat_generate_request(
 
 
 def v1_chat_generate_response(
-    request,
+    request,
+    ret,
+    to_file=False,
+    cache_report=False,
+    tool_call_parser=None,
+    reasoning_parser=None,
 ):
     choices = []
 
@@ -1086,9 +1098,26 @@ def v1_chat_generate_response(
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools
+            separate_reasoning = request[idx].separate_reasoning
         else:
             tool_choice = request.tool_choice
             tools = request.tools
+            separate_reasoning = request.separate_reasoning
+
+        if reasoning_parser and separate_reasoning:
+            try:
+                parser = ReasoningParser(
+                    model_type=reasoning_parser, stream_reasoning=False
+                )
+                reasoning_text, text = parser.parse_non_stream(text)
+            except Exception as e:
+                logger.error(f"Exception: {e}")
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "Failed to parse reasoning related info to json format!",
+                )
+        else:
+            reasoning_text = None
 
         if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
             if finish_reason == "stop":
@@ -1118,8 +1147,9 @@ def v1_chat_generate_response(
                     "index": 0,
                     "message": {
                         "role": "assistant",
-                        "content":
+                        "content": text if tool_calls is None else None,
                         "tool_calls": tool_calls,
+                        "reasoning_content": reasoning_text,
                     },
                     "logprobs": choice_logprobs,
                     "finish_reason": (finish_reason["type"] if finish_reason else ""),
@@ -1134,8 +1164,9 @@ def v1_chat_generate_response(
                 index=idx,
                 message=ChatMessage(
                     role="assistant",
-                    content=
+                    content=text if tool_calls is None else None,
                     tool_calls=tool_calls,
+                    reasoning_content=reasoning_text,
                 ),
                 logprobs=choice_logprobs,
                 finish_reason=(finish_reason["type"] if finish_reason else ""),
@@ -1202,6 +1233,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 
    if adapted_request.stream:
        parser_dict = {}
+        reasoning_parser_dict = {}
 
        async def generate_stream_resp():
            is_firsts = {}
@@ -1268,15 +1300,27 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                        choice_logprobs = None
 
                    finish_reason = content["meta_info"]["finish_reason"]
+                    finish_reason_type = (
+                        finish_reason["type"] if finish_reason else None
+                    )
 
                    if is_first:
                        # First chunk with role
                        is_first = False
+                        if (
+                            tokenizer_manager.server_args.reasoning_parser
+                            and request.separate_reasoning
+                        ):
+                            delta = DeltaMessage(role="assistant", reasoning_content="")
+                        else:
+                            delta = DeltaMessage(role="assistant", content="")
                        choice_data = ChatCompletionResponseStreamChoice(
                            index=index,
-                            delta=
+                            delta=delta,
                            finish_reason=(
-
+                                None
+                                if finish_reason_type and len(finish_reason_type) == 0
+                                else finish_reason_type
                            ),
                            matched_stop=(
                                finish_reason["matched"]
@@ -1296,6 +1340,41 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                    delta = text[len(stream_buffer) :]
                    new_stream_buffer = stream_buffer + delta
 
+                    if (
+                        tokenizer_manager.server_args.reasoning_parser
+                        and request.separate_reasoning
+                    ):
+                        if index not in reasoning_parser_dict:
+                            reasoning_parser_dict[index] = ReasoningParser(
+                                tokenizer_manager.server_args.reasoning_parser,
+                                request.stream_reasoning,
+                            )
+                        reasoning_parser = reasoning_parser_dict[index]
+                        reasoning_text, delta = reasoning_parser.parse_stream_chunk(
+                            delta
+                        )
+                        if reasoning_text:
+                            choice_data = ChatCompletionResponseStreamChoice(
+                                index=index,
+                                delta=DeltaMessage(reasoning_content=reasoning_text),
+                                finish_reason=(
+                                    None
+                                    if finish_reason_type
+                                    and len(finish_reason_type) == 0
+                                    else finish_reason_type
+                                ),
+                            )
+                            chunk = ChatCompletionStreamResponse(
+                                id=content["meta_info"]["id"],
+                                choices=[choice_data],
+                                model=request.model,
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
+                        if (delta and len(delta) == 0) or not delta:
+                            stream_buffers[index] = new_stream_buffer
+                            is_firsts[index] = is_first
+                            continue
+
                    if request.tool_choice != "none" and request.tools:
                        if index not in parser_dict:
                            parser_dict[index] = FunctionCallParser(
@@ -1313,7 +1392,10 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                                    index=index,
                                    delta=DeltaMessage(content=normal_text),
                                    finish_reason=(
-
+                                        None
+                                        if finish_reason_type
+                                        and len(finish_reason_type) == 0
+                                        else finish_reason_type
                                    ),
                                )
                                chunk = ChatCompletionStreamResponse(
@@ -1382,7 +1464,9 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                            index=index,
                            delta=DeltaMessage(content=delta),
                            finish_reason=(
-
+                                None
+                                if finish_reason_type and len(finish_reason_type) == 0
+                                else finish_reason_type
                            ),
                            matched_stop=(
                                finish_reason["matched"]
@@ -1450,6 +1534,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
        ret,
        cache_report=tokenizer_manager.server_args.enable_cache_report,
        tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+        reasoning_parser=tokenizer_manager.server_args.reasoning_parser,
    )
 
    return response
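Taken together, the adapter changes let an OpenAI-compatible client receive the model's chain-of-thought separately from the final answer. The sketch below is a hypothetical client call: it assumes a running sglang server configured with the new reasoning-parser server argument (set to deepseek-r1) and uses the separate_reasoning / stream_reasoning request fields defined in the protocol.py diff below.

# Hypothetical client call; field names come from protocol.py, server setup is assumed.
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "What is 9 * 13?"}],
    extra_body={
        "separate_reasoning": True,  # split <think>...</think> out of content
        "stream_reasoning": False,   # only affects streaming responses
    },
)

message = resp.choices[0].message
print(getattr(message, "reasoning_content", None))  # text captured before </think>
print(message.content)                              # the final answer only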
sglang/srt/openai_api/protocol.py
CHANGED
@@ -258,6 +258,18 @@ class ResponseFormat(BaseModel):
     json_schema: Optional[JsonSchemaResponseFormat] = None
 
 
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
 class Function(BaseModel):
     """Function descriptions."""
 
@@ -298,7 +310,7 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = None
     n: int = 1
     presence_penalty: float = 0.0
-    response_format:
+    response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
@@ -324,6 +336,8 @@ class ChatCompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    separate_reasoning: bool = True
+    stream_reasoning: bool = True
 
 
 class FunctionResponse(BaseModel):
@@ -344,6 +358,7 @@ class ToolCall(BaseModel):
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
 
@@ -367,6 +382,7 @@ class ChatCompletionResponse(BaseModel):
 class DeltaMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
 
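The new StructuralTagResponseFormat allows constrained generation that only activates when a trigger string appears in the output. An illustrative response_format payload built from the fields defined above (the schema and trigger values are invented for the example):

# Illustrative request fragment for the structural_tag response format.
# "schema" is the wire-level alias of the schema_ field; the values are made up.
response_format = {
    "type": "structural_tag",
    "structures": [
        {
            "begin": "<function=get_weather>",
            "schema": {"type": "object", "properties": {"city": {"type": "string"}}},
            "end": "</function>",
        }
    ],
    "triggers": ["<function="],
}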
sglang/srt/reasoning_parser.py
ADDED
@@ -0,0 +1,154 @@
+import re
+from typing import Dict, Tuple
+
+
+class StreamingParseResult:
+    """Result of streaming incremental parsing."""
+
+    def __init__(self, normal_text: str = "", reasoning_text: str = ""):
+        self.normal_text = normal_text
+        self.reasoning_text = reasoning_text
+
+
+class BaseReasoningFormatDetector:
+    """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+    def __init__(
+        self,
+        think_start_token: str,
+        think_end_token: str,
+        force_reasoning: bool = False,
+        stream_reasoning: bool = True,
+    ):
+        self.think_start_token = think_start_token
+        self.think_end_token = think_end_token
+        self._in_reasoning = force_reasoning
+        self.stream_reasoning = stream_reasoning
+
+        self._buffer = ""
+        self.stripped_think_start = False
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses reasoning sections in the provided text.
+        Returns both reasoning content and normal text separately.
+        """
+        text = text.replace(self.think_start_token, "").strip()
+        if self.think_end_token not in text:
+            # Assume reasoning was truncated before `</think>` token
+            return StreamingParseResult(reasoning_text=text)
+
+        # Extract reasoning content
+        splits = text.split(self.think_end_token, maxsplit=1)
+        reasoning_text = splits[0]
+        text = splits[1].strip()
+
+        return StreamingParseResult(normal_text=text, reasoning_text=reasoning_text)
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for reasoning content.
+        Handles partial reasoning tags and content.
+
+        If stream_reasoning is False:
+            Accumulates reasoning content until the end tag is found
+        If stream_reasoning is True:
+            Streams reasoning content as it arrives
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Strip `<think>` token if present
+        if not self.stripped_think_start and self.think_start_token in current_text:
+            current_text = current_text.replace(self.think_start_token, "")
+            self.stripped_think_start = True
+
+        # Handle end of reasoning block
+        if self._in_reasoning and self.think_end_token in current_text:
+            end_idx = current_text.find(self.think_end_token)
+
+            reasoning_text = current_text[:end_idx]
+
+            self._buffer = ""
+            self._in_reasoning = False
+            normal_text = current_text[end_idx + len(self.think_end_token) :]
+
+            return StreamingParseResult(
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+            )
+
+        # Continue with reasoning content
+        if self._in_reasoning:
+            if self.stream_reasoning:
+                # Stream the content immediately
+                self._buffer = ""
+                return StreamingParseResult(reasoning_text=current_text)
+            else:
+                return StreamingParseResult()
+
+        # If we're not in a reasoning block return as normal text
+        if not self._in_reasoning:
+            self._buffer = ""
+            return StreamingParseResult(normal_text=new_text)
+
+        return StreamingParseResult()
+
+
+class DeepSeekR1Detector(BaseReasoningFormatDetector):
+    """
+    Detector for DeepSeek-R1 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        # DeepSeek-R1 is assumed to be reasoning until `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+        # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+class ReasoningParser:
+    """
+    Parser that handles both streaming and non-streaming scenarios for extracting
+    reasoning content from model outputs.
+
+    Args:
+        model_type (str): Type of model to parse reasoning from
+        stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
+            If True, streams reasoning content as it arrives.
+    """
+
+    DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
+        "deepseek-r1": DeepSeekR1Detector
+    }
+
+    def __init__(self, model_type: str = None, stream_reasoning: bool = True):
+        if not model_type:
+            raise ValueError("Model type must be specified")
+
+        detector_class = self.DetectorMap.get(model_type.lower())
+        if not detector_class:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+        self.detector = detector_class(stream_reasoning=stream_reasoning)
+
+    def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
+        """Non-streaming call: one-time parsing"""
+        ret = self.detector.detect_and_parse(full_text)
+        return ret.reasoning_text, ret.normal_text
+
+    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, str]:
+        """Streaming call: incremental parsing"""
+        ret = self.detector.parse_streaming_increment(chunk_text)
+        return ret.reasoning_text, ret.normal_text
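Because the new module has no sglang dependencies beyond the standard library, its behavior can be checked in isolation. A short usage sketch based only on the classes above (the example strings are invented):

from sglang.srt.reasoning_parser import ReasoningParser

# Non-streaming: split a complete generation into reasoning and answer.
parser = ReasoningParser(model_type="deepseek-r1", stream_reasoning=False)
reasoning, answer = parser.parse_non_stream("<think>9 * 13 = 117</think>The answer is 117.")
# reasoning == "9 * 13 = 117", answer == "The answer is 117."

# Streaming: feed deltas as they arrive. With stream_reasoning=False the parser
# withholds reasoning text until the </think> tag shows up in the stream.
stream_parser = ReasoningParser(model_type="deepseek-r1", stream_reasoning=False)
for delta in ["<think>9 * 13", " = 117</think>", "The answer is 117."]:
    reasoning_delta, normal_delta = stream_parser.parse_stream_chunk(delta)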
sglang/srt/sampling/penaltylib/__init__.py
CHANGED
@@ -1,13 +1,11 @@
-from .
-from .
-from .
-from .
-from .penalizers.repetition_penalty import BatchedRepetitionPenalizer
+from sglang.srt.sampling.penaltylib.frequency_penalty import BatchedFrequencyPenalizer
+from sglang.srt.sampling.penaltylib.min_new_tokens import BatchedMinNewTokensPenalizer
+from sglang.srt.sampling.penaltylib.orchestrator import BatchedPenalizerOrchestrator
+from sglang.srt.sampling.penaltylib.presence_penalty import BatchedPresencePenalizer
 
 __all__ = [
     "BatchedFrequencyPenalizer",
     "BatchedMinNewTokensPenalizer",
     "BatchedPresencePenalizer",
-    "BatchedRepetitionPenalizer",
     "BatchedPenalizerOrchestrator",
 ]