sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +89 -54
- sglang/bench_serving.py +437 -40
- sglang/lang/interpreter.py +1 -1
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +90 -27
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +82 -26
- sglang/srt/entrypoints/openai/serving_completions.py +25 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +28 -7
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +381 -136
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +11 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -8
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +111 -56
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
- sglang/srt/layers/quantization/fp8.py +78 -48
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +45 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +93 -68
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +396 -365
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +18 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +190 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +148 -122
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +77 -480
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +53 -40
- sglang/srt/mem_cache/hiradix_cache.py +196 -104
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +395 -53
- sglang/srt/mem_cache/memory_pool_host.py +27 -19
- sglang/srt/mem_cache/radix_cache.py +6 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +190 -32
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +323 -53
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +7 -19
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +91 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{conversation.py → parser/conversation.py} +38 -5
- sglang/srt/parser/harmony_parser.py +588 -0
- sglang/srt/parser/reasoning_parser.py +309 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +307 -80
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +96 -7
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- sglang/srt/reasoning_parser.py +0 -553
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/parser/reasoning_parser.py
@@ -0,0 +1,309 @@
+import re
+from typing import Dict, Optional, Tuple, Type
+
+from sglang.srt.parser.harmony_parser import HarmonyParser
+
+
+class StreamingParseResult:
+    """Result of streaming incremental parsing."""
+
+    def __init__(
+        self,
+        normal_text: Optional[str] = None,
+        reasoning_text: Optional[str] = None,
+    ):
+        self.normal_text = normal_text or ""
+        self.reasoning_text = reasoning_text or ""
+
+
+class BaseReasoningFormatDetector:
+    """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+    def __init__(
+        self,
+        think_start_token: str,
+        think_end_token: str,
+        force_reasoning: bool = False,
+        stream_reasoning: bool = True,
+    ):
+        self.think_start_token = think_start_token
+        self.think_end_token = think_end_token
+        self._in_reasoning = force_reasoning
+        self.stream_reasoning = stream_reasoning
+
+        self._buffer = ""
+        self.stripped_think_start = False
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses reasoning sections in the provided text.
+        Returns both reasoning content and normal text separately.
+        """
+        in_reasoning = self._in_reasoning or self.think_start_token in text
+
+        if not in_reasoning:
+            return StreamingParseResult(normal_text=text)
+
+        # The text is considered to be in a reasoning block.
+        processed_text = text.replace(self.think_start_token, "").strip()
+
+        if self.think_end_token not in processed_text:
+            # Assume reasoning was truncated before `</think>` token
+            return StreamingParseResult(reasoning_text=processed_text)
+
+        # Extract reasoning content
+        splits = processed_text.split(self.think_end_token, maxsplit=1)
+        reasoning_text = splits[0]
+        normal_text = splits[1].strip()
+
+        return StreamingParseResult(
+            normal_text=normal_text, reasoning_text=reasoning_text
+        )
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for reasoning content.
+        Handles partial reasoning tags and content.
+
+        If stream_reasoning is False:
+            Accumulates reasoning content until the end tag is found
+        If stream_reasoning is True:
+            Streams reasoning content as it arrives
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
+        # Strip `<think>` token if present
+        if not self.stripped_think_start and self.think_start_token in current_text:
+            current_text = current_text.replace(self.think_start_token, "")
+            self.stripped_think_start = True
+            self._in_reasoning = True
+
+        # Handle end of reasoning block
+        if self._in_reasoning and self.think_end_token in current_text:
+            end_idx = current_text.find(self.think_end_token)
+
+            reasoning_text = current_text[:end_idx]
+
+            self._buffer = ""
+            self._in_reasoning = False
+            normal_text = current_text[end_idx + len(self.think_end_token) :]
+
+            return StreamingParseResult(
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+            )
+
+        # Continue with reasoning content
+        if self._in_reasoning:
+            if self.stream_reasoning:
+                # Stream the content immediately
+                self._buffer = ""
+                return StreamingParseResult(reasoning_text=current_text)
+            else:
+                return StreamingParseResult()
+
+        # If we're not in a reasoning block return as normal text
+        if not self._in_reasoning:
+            self._buffer = ""
+            return StreamingParseResult(normal_text=current_text)
+
+        return StreamingParseResult()
+
+
+class DeepSeekR1Detector(BaseReasoningFormatDetector):
+    """
+    Detector for DeepSeek-R1 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        # DeepSeek-R1 is assumed to be reasoning until `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+        # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+class Qwen3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+
+
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
+class GptOssDetector(BaseReasoningFormatDetector):
+    """
+    Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        super().__init__(
+            "<|channel|>analysis<|message|>",
+            "<|end|>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+        self.parser = HarmonyParser()
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        events = self.parser.parse(text)
+        # Flush the buffer for one-shot parsing
+        events += self.parser.parse("")
+
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
+        )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+        # Tool call events preserve raw text with structural markers
+
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        events = self.parser.parse(new_text)
+
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
+        )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
+
+
+class ReasoningParser:
+    """
+    Parser that handles both streaming and non-streaming scenarios for extracting
+    reasoning content from model outputs.
+
+    Args:
+        model_type (str): Type of model to parse reasoning from
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
+            If True, streams reasoning content as it arrives.
+    """
+
+    DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
+        "deepseek-r1": DeepSeekR1Detector,
+        "deepseek-v3": Qwen3Detector,
+        "glm45": Qwen3Detector,
+        "gpt-oss": GptOssDetector,
+        "kimi": KimiDetector,
+        "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3Detector,
+        "step3": DeepSeekR1Detector,
+    }
+
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: Optional[bool] = None,
+    ):
+        if not model_type:
+            raise ValueError("Model type must be specified")
+
+        detector_class = self.DetectorMap.get(model_type.lower())
+        if not detector_class:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+        # Special cases where we override force_reasoning
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+            force_reasoning = True
+
+        # Only pass force_reasoning if explicitly set, let detectors use their defaults
+        kwargs = {"stream_reasoning": stream_reasoning}
+        if force_reasoning is not None:
+            kwargs["force_reasoning"] = force_reasoning
+
+        self.detector = detector_class(**kwargs)
+
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+        """Non-streaming call: one-time parsing"""
+        ret = self.detector.detect_and_parse(full_text)
+        return ret.reasoning_text, ret.normal_text
+
+    def parse_stream_chunk(
+        self, chunk_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Streaming call: incremental parsing"""
+        ret = self.detector.parse_streaming_increment(chunk_text)
+        return ret.reasoning_text, ret.normal_text
sglang/srt/sampling/penaltylib/orchestrator.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 import abc
-from typing import TYPE_CHECKING, Set, Type
+import weakref
+from typing import TYPE_CHECKING, Optional, Set, Type
 
 import torch
 
@@ -17,7 +18,7 @@ class BatchedPenalizerOrchestrator:
         penalizers: Set[Type["_BatchedPenalizer"]],
     ):
         self.vocab_size = vocab_size
-        self.batch = batch
+        self._batch_ref = weakref.ref(batch)
         self.device = batch.device
         self.penalizers = {Penalizer: Penalizer(self) for Penalizer in penalizers}
 
@@ -27,6 +28,17 @@ class BatchedPenalizerOrchestrator:
             is_required |= pen_is_required
         self.is_required = is_required
 
+    @property
+    def batch(self) -> ScheduleBatch | None:
+        return self._batch_ref()
+
+    @batch.setter
+    def batch(self, value: Optional[ScheduleBatch]):
+        if value is None:
+            self._batch_ref = lambda: None
+        else:
+            self._batch_ref = weakref.ref(value)
+
    def reqs(self):
        return self.batch.reqs
 
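A standalone sketch of the weak-reference pattern the orchestrator adopts here: the batch is held through `weakref.ref` and re-exposed as a `batch` property, so existing `orchestrator.batch` call sites keep working while the orchestrator no longer keeps the `ScheduleBatch` alive on its own. The class names and driver code below are hypothetical stand-ins, not sglang types:

```python
import weakref
from typing import Optional


class Batch:
    """Hypothetical stand-in for ScheduleBatch."""


class Orchestrator:
    def __init__(self, batch: Batch):
        # Store only a weak reference; the orchestrator does not own the batch.
        self._batch_ref = weakref.ref(batch)

    @property
    def batch(self) -> Optional[Batch]:
        # Dereference the weakref; returns None once the batch has been collected.
        return self._batch_ref()

    @batch.setter
    def batch(self, value: Optional[Batch]):
        self._batch_ref = (lambda: None) if value is None else weakref.ref(value)


b = Batch()
orch = Orchestrator(b)
assert orch.batch is b      # behaves like a plain attribute while the batch lives
del b                       # drop the only strong reference
assert orch.batch is None   # on CPython the batch is collected immediately
```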
sglang/srt/sampling/sampling_batch_info.py
@@ -67,28 +67,31 @@ class SamplingBatchInfo:
     logit_bias: Optional[torch.Tensor] = None
 
     @classmethod
-    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+    def _get_global_server_args_dict(cls):
         from sglang.srt.managers.schedule_batch import global_server_args_dict
 
+        return global_server_args_dict
+
+    @classmethod
+    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+        global_server_args_dict = cls._get_global_server_args_dict()
+
         reqs = batch.reqs
         device = batch.device
-        temperatures = (
-            torch.tensor(
-                [r.sampling_params.temperature for r in reqs],
-                dtype=torch.float,
-            )
-            .view(-1, 1)
-            .to(device, non_blocking=True)
-        )
+        temperatures = torch.tensor(
+            [r.sampling_params.temperature for r in reqs],
+            dtype=torch.float,
+            device=device,
+        ).view(-1, 1)
         top_ps = torch.tensor(
-            [r.sampling_params.top_p for r in reqs], dtype=torch.float
-        )
+            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+        )
         top_ks = torch.tensor(
-            [r.sampling_params.top_k for r in reqs], dtype=torch.int32
-        )
+            [r.sampling_params.top_k for r in reqs], dtype=torch.int32, device=device
+        )
         min_ps = torch.tensor(
-            [r.sampling_params.min_p for r in reqs], dtype=torch.float
-        )
+            [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+        )
 
         logit_bias = None
         if any(r.sampling_params.logit_bias is not None for r in reqs):