PyPI - sglang - Versions diffs - 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

sglang 0.5.1.post2-py3-none-any.whl → 0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (256) hide show

sglang/bench_one_batch.py +3 -0
sglang/bench_one_batch_server.py +89 -54
sglang/bench_serving.py +437 -40
sglang/lang/interpreter.py +1 -1
sglang/profiler.py +0 -1
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/internvl.py +6 -0
sglang/srt/configs/longcat_flash.py +104 -0
sglang/srt/configs/model_config.py +37 -7
sglang/srt/configs/qwen3_next.py +326 -0
sglang/srt/connector/__init__.py +1 -1
sglang/srt/connector/base_connector.py +1 -2
sglang/srt/connector/redis.py +2 -2
sglang/srt/connector/serde/__init__.py +1 -1
sglang/srt/connector/serde/safe_serde.py +4 -3
sglang/srt/custom_op.py +11 -1
sglang/srt/debug_utils/dump_comparator.py +81 -44
sglang/srt/debug_utils/dump_loader.py +97 -0
sglang/srt/debug_utils/dumper.py +11 -3
sglang/srt/debug_utils/text_comparator.py +73 -11
sglang/srt/disaggregation/ascend/conn.py +75 -0
sglang/srt/disaggregation/base/conn.py +1 -1
sglang/srt/disaggregation/common/conn.py +15 -12
sglang/srt/disaggregation/decode.py +6 -4
sglang/srt/disaggregation/fake/conn.py +1 -1
sglang/srt/disaggregation/mini_lb.py +6 -420
sglang/srt/disaggregation/mooncake/conn.py +18 -10
sglang/srt/disaggregation/nixl/conn.py +180 -16
sglang/srt/disaggregation/prefill.py +6 -4
sglang/srt/disaggregation/utils.py +5 -50
sglang/srt/distributed/parallel_state.py +94 -58
sglang/srt/entrypoints/engine.py +34 -14
sglang/srt/entrypoints/http_server.py +172 -47
sglang/srt/entrypoints/openai/protocol.py +90 -27
sglang/srt/entrypoints/openai/serving_base.py +6 -2
sglang/srt/entrypoints/openai/serving_chat.py +82 -26
sglang/srt/entrypoints/openai/serving_completions.py +25 -4
sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
sglang/srt/entrypoints/openai/serving_responses.py +7 -4
sglang/srt/eplb/eplb_manager.py +28 -4
sglang/srt/eplb/expert_distribution.py +55 -15
sglang/srt/eplb/expert_location.py +8 -3
sglang/srt/eplb/expert_location_updater.py +1 -1
sglang/srt/function_call/deepseekv31_detector.py +222 -0
sglang/srt/function_call/ebnf_composer.py +11 -9
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/glm4_moe_detector.py +1 -1
sglang/srt/function_call/gpt_oss_detector.py +144 -256
sglang/srt/function_call/qwen3_coder_detector.py +1 -1
sglang/srt/hf_transformers_utils.py +28 -7
sglang/srt/layers/activation.py +44 -9
sglang/srt/layers/attention/aiter_backend.py +93 -68
sglang/srt/layers/attention/ascend_backend.py +381 -136
sglang/srt/layers/attention/fla/chunk.py +242 -0
sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
sglang/srt/layers/attention/fla/chunk_o.py +178 -0
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
sglang/srt/layers/attention/fla/cumsum.py +300 -0
sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
sglang/srt/layers/attention/fla/index.py +37 -0
sglang/srt/layers/attention/fla/l2norm.py +150 -0
sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
sglang/srt/layers/attention/fla/op.py +66 -0
sglang/srt/layers/attention/fla/solve_tril.py +465 -0
sglang/srt/layers/attention/fla/utils.py +331 -0
sglang/srt/layers/attention/fla/wy_fast.py +158 -0
sglang/srt/layers/attention/flashattention_backend.py +241 -7
sglang/srt/layers/attention/flashinfer_backend.py +11 -6
sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
sglang/srt/layers/attention/intel_amx_backend.py +3 -0
sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
sglang/srt/layers/attention/mamba/mamba.py +64 -0
sglang/srt/layers/attention/torch_native_backend.py +12 -6
sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
sglang/srt/layers/communicator.py +45 -8
sglang/srt/layers/layernorm.py +54 -12
sglang/srt/layers/logits_processor.py +10 -3
sglang/srt/layers/moe/__init__.py +2 -1
sglang/srt/layers/moe/cutlass_moe.py +0 -8
sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
sglang/srt/layers/moe/ep_moe/layer.py +111 -56
sglang/srt/layers/moe/fused_moe_native.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
sglang/srt/layers/moe/moe_runner/base.py +274 -1
sglang/srt/layers/moe/moe_runner/runner.py +80 -0
sglang/srt/layers/moe/moe_runner/triton.py +448 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
sglang/srt/layers/moe/topk.py +43 -12
sglang/srt/layers/moe/utils.py +6 -5
sglang/srt/layers/quantization/awq.py +19 -7
sglang/srt/layers/quantization/base_config.py +11 -6
sglang/srt/layers/quantization/blockwise_int8.py +38 -27
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
sglang/srt/layers/quantization/fp8.py +78 -48
sglang/srt/layers/quantization/fp8_kernel.py +2 -2
sglang/srt/layers/quantization/fp8_utils.py +45 -31
sglang/srt/layers/quantization/gptq.py +25 -17
sglang/srt/layers/quantization/modelopt_quant.py +107 -40
sglang/srt/layers/quantization/moe_wna16.py +21 -18
sglang/srt/layers/quantization/mxfp4.py +93 -68
sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
sglang/srt/layers/quantization/quark/utils.py +97 -0
sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
sglang/srt/layers/quantization/unquant.py +135 -47
sglang/srt/layers/quantization/utils.py +13 -0
sglang/srt/layers/quantization/w4afp8.py +60 -42
sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
sglang/srt/layers/quantization/w8a8_int8.py +83 -41
sglang/srt/layers/rocm_linear_utils.py +44 -0
sglang/srt/layers/rotary_embedding.py +28 -19
sglang/srt/layers/sampler.py +29 -5
sglang/srt/layers/utils.py +0 -14
sglang/srt/lora/backend/base_backend.py +50 -8
sglang/srt/lora/backend/triton_backend.py +90 -2
sglang/srt/lora/layers.py +32 -0
sglang/srt/lora/lora.py +4 -1
sglang/srt/lora/lora_manager.py +35 -112
sglang/srt/lora/mem_pool.py +24 -10
sglang/srt/lora/utils.py +18 -9
sglang/srt/managers/cache_controller.py +396 -365
sglang/srt/managers/data_parallel_controller.py +30 -15
sglang/srt/managers/detokenizer_manager.py +18 -2
sglang/srt/managers/disagg_service.py +46 -0
sglang/srt/managers/io_struct.py +190 -11
sglang/srt/managers/mm_utils.py +6 -1
sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
sglang/srt/managers/schedule_batch.py +27 -44
sglang/srt/managers/schedule_policy.py +4 -3
sglang/srt/managers/scheduler.py +148 -122
sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
sglang/srt/managers/template_manager.py +3 -3
sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
sglang/srt/managers/tokenizer_manager.py +77 -480
sglang/srt/managers/tp_worker.py +16 -4
sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
sglang/srt/mem_cache/allocator.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +1 -1
sglang/srt/mem_cache/hicache_storage.py +53 -40
sglang/srt/mem_cache/hiradix_cache.py +196 -104
sglang/srt/mem_cache/lora_radix_cache.py +1 -1
sglang/srt/mem_cache/memory_pool.py +395 -53
sglang/srt/mem_cache/memory_pool_host.py +27 -19
sglang/srt/mem_cache/radix_cache.py +6 -6
sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
sglang/srt/mem_cache/swa_radix_cache.py +1 -3
sglang/srt/metrics/collector.py +484 -63
sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
sglang/srt/metrics/utils.py +48 -0
sglang/srt/model_executor/cpu_graph_runner.py +640 -0
sglang/srt/model_executor/cuda_graph_runner.py +13 -5
sglang/srt/model_executor/forward_batch_info.py +72 -18
sglang/srt/model_executor/model_runner.py +190 -32
sglang/srt/model_loader/__init__.py +9 -3
sglang/srt/model_loader/loader.py +33 -28
sglang/srt/model_loader/utils.py +12 -0
sglang/srt/model_loader/weight_utils.py +2 -1
sglang/srt/models/deepseek_v2.py +323 -53
sglang/srt/models/gemma3n_mm.py +1 -1
sglang/srt/models/glm4_moe.py +10 -1
sglang/srt/models/glm4v.py +4 -2
sglang/srt/models/gpt_oss.py +7 -19
sglang/srt/models/internvl.py +28 -0
sglang/srt/models/llama4.py +9 -0
sglang/srt/models/llama_eagle3.py +17 -0
sglang/srt/models/longcat_flash.py +1026 -0
sglang/srt/models/longcat_flash_nextn.py +699 -0
sglang/srt/models/minicpmv.py +165 -3
sglang/srt/models/mllama4.py +25 -0
sglang/srt/models/opt.py +637 -0
sglang/srt/models/qwen2.py +33 -3
sglang/srt/models/qwen2_5_vl.py +91 -42
sglang/srt/models/qwen2_moe.py +79 -14
sglang/srt/models/qwen3.py +8 -2
sglang/srt/models/qwen3_moe.py +39 -8
sglang/srt/models/qwen3_next.py +1039 -0
sglang/srt/models/qwen3_next_mtp.py +109 -0
sglang/srt/models/torch_native_llama.py +1 -1
sglang/srt/models/transformers.py +1 -1
sglang/srt/multimodal/processors/base_processor.py +4 -2
sglang/srt/multimodal/processors/glm4v.py +9 -9
sglang/srt/multimodal/processors/internvl.py +141 -129
sglang/srt/{conversation.py → parser/conversation.py} +38 -5
sglang/srt/parser/harmony_parser.py +588 -0
sglang/srt/parser/reasoning_parser.py +309 -0
sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
sglang/srt/sampling/sampling_batch_info.py +18 -15
sglang/srt/server_args.py +307 -80
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
sglang/srt/speculative/eagle_worker.py +216 -120
sglang/srt/speculative/spec_info.py +5 -0
sglang/srt/speculative/standalone_worker.py +109 -0
sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
sglang/srt/utils.py +96 -7
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_trtllm_mla_backend.py +181 -8
sglang/test/few_shot_gsm8k.py +1 -0
sglang/test/runners.py +4 -0
sglang/test/test_cutlass_moe.py +24 -6
sglang/test/test_cutlass_w4a8_moe.py +24 -9
sglang/test/test_disaggregation_utils.py +66 -0
sglang/test/test_utils.py +25 -1
sglang/utils.py +5 -0
sglang/version.py +1 -1
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
sglang/srt/disaggregation/launch_lb.py +0 -131
sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
sglang/srt/reasoning_parser.py +0 -553
/sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
/sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
/sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0

sglang/srt/function_call/gpt_oss_detector.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import json
 import logging
 import re
-from typing import List
+from typing import List, Optional
 from sglang.srt.entrypoints.openai.protocol import Tool
 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -10,60 +10,31 @@ from sglang.srt.function_call.core_types import (
     ToolCallItem,
     _GetInfoFunc,
 )
+from sglang.srt.parser.harmony_parser import HarmonyParser
 logger = logging.getLogger(__name__)
 class GptOssDetector(BaseFormatDetector):
     """
-    Detector for T4-style function calls with channel format.
+    Detector for T4-style function calls using HarmonyParser.
-    Supports two formats:
-    1. Direct function call: <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
-    2. Commentary with action plan: <|channel|>commentary<|message|>{content}<|end|>
-    For parallel function calls, each call is self-contained and starts with its own channel:
-    <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"SF"}<|call|>
-    <|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"SF attractions"}<|call|>
-    Examples:
-    Single: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"San Francisco"}<|call|>commentary
-    Multiple: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"Paris"}<|call|>commentary<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"Paris tourism"}<|call|>
-    With Action Plan: <|channel|>commentary<|message|>**Action plan**: 1. Do X 2. Do Y<|end|><|start|>assistant<|channel|>commentary to=functions.x<|constrain|>json<|message|>{"template": "basic_html", "path": "index.html"}<|call|>
+    Handles tool calls in the format:
+    <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
     """
     def __init__(self):
         super().__init__()
+        self.harmony_parser = HarmonyParser()
         self.bot_token = "<|start|>assistant<|channel|>commentary"
         self.eot_token = "<|call|>"
-        # TODO: no clear indication how parallel tool call response format is
-        self.tool_call_separator = ""
-        # Pattern for complete function calls with to= parameter
-        # Handles both <|call|> and <|call|>commentary endings
-        # Also handles optional <|start|>assistant prefix and whitespace after function name
-        self.function_call_pattern = re.compile(
-            r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
-            r"<\|constrain\|>json<\|message\|>(.*?)<\|call\|>(?:commentary)?",
-            re.DOTALL,
-        )
-        # Pattern for streaming function calls (incomplete)
-        # Also handles optional whitespace after function name
-        self.streaming_pattern = re.compile(
-            r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
-            r"<\|constrain\|>json<\|message\|>(.*)",
-            re.DOTALL,
-        )
-        # Pattern for commentary with action plan (no to= parameter)
-        self.commentary_pattern = re.compile(
-            r"<\|channel\|>commentary<\|message\|>(.*?)<\|end\|>",
+        # Pattern to extract function name and JSON from tool_call event content
+        self.tool_extract_pattern = re.compile(
+            r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
             re.DOTALL,
         )
-        self._last_arguments = ""
     def has_tool_call(self, text: str) -> bool:
         """Check if text contains TypeScript-style function call markers."""
         return self.bot_token in text
@@ -73,259 +44,176 @@ class GptOssDetector(BaseFormatDetector):
         if not self.has_tool_call(text):
             return StreamingParseResult(normal_text=text, calls=[])
-        tool_indices = self._get_tool_indices(tools)
+        # Parse with HarmonyParser
+        events = self.harmony_parser.parse(text)
+        # Flush buffer for complete parsing
+        events += self.harmony_parser.parse("")
+        tool_indices = self._get_tool_indices(tools)
         calls = []
+        normal_parts = []
         tool_index = 0
-        # Process the entire text to handle mixed commentary and tool calls
-        normal_text_parts = []
-        # Find all commentary sections (both with and without to=)
-        all_commentary_pattern = re.compile(
-            r"<\|channel\|>commentary(?:\s+to=[^<]*)?<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
-            re.DOTALL,
-        )
-        # Track processed positions to avoid double-processing
-        processed_ranges = []
-        # First, extract all tool calls
-        for match in self.function_call_pattern.finditer(text):
-            full_function_name = match.group(1)
-            args_content = match.group(2)
-            processed_ranges.append((match.start(), match.end()))
-            function_name = (
-                full_function_name.split(".")[-1]
-                if "." in full_function_name
-                else full_function_name
-            )
-            try:
-                arguments = json.loads(args_content) if args_content.strip() else {}
-            except json.JSONDecodeError:
-                continue
-            if function_name in tool_indices:
-                calls.append(
-                    ToolCallItem(
-                        tool_index=tool_index,
-                        name=function_name,
-                        parameters=json.dumps(arguments, ensure_ascii=False),
-                    )
+        for event in events:
+            if event.event_type == "tool_call":
+                # Extract tool call from event content
+                tool_call = self._extract_tool_call_from_event(
+                    event.raw_text if event.raw_text else event.content,
+                    tool_indices,
+                    tool_index,
                 )
-                tool_index += 1
-        # Then, find non-tool-call commentary sections for normal text
-        for match in all_commentary_pattern.finditer(text):
-            # Check if this match overlaps with any processed tool call
-            match_start, match_end = match.start(), match.end()
-            is_tool_call = any(
-                start <= match_start < end or start < match_end <= end
-                for start, end in processed_ranges
-            )
-            # If this commentary is not part of a tool call, include it in normal text
-            if not is_tool_call:
-                content = match.group(1).strip()
-                if content:
-                    normal_text_parts.append(content)
-        # Handle remaining text after all matches
-        if processed_ranges:
-            last_match_end = max(end for _, end in processed_ranges)
-            if last_match_end < len(text):
-                remaining_text = text[last_match_end:]
-            # Clean up <|start|>assistant prefixes and extract final content
-            # Remove standalone <|start|>assistant prefixes
-            remaining_text = re.sub(r"<\|start\|>assistant(?!\w)", "", remaining_text)
-            # Extract content from final channel if present
-            final_pattern = re.compile(
-                r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", re.DOTALL
-            )
-            final_match = final_pattern.search(remaining_text)
-            if final_match:
-                # Get everything before final channel + final channel content
-                before_final = remaining_text[: final_match.start()].strip()
-                final_content = final_match.group(1).strip()
+                if tool_call:
+                    calls.append(tool_call)
+                    tool_index += 1
+            elif event.event_type == "normal":
+                normal_parts.append(event.content)
+            # Ignore reasoning events in function call context
-                parts = []
-                if before_final:
-                    parts.append(before_final)
-                if final_content:
-                    parts.append(final_content)
-                remaining_text = " ".join(parts) if parts else ""
-            remaining_text = remaining_text.strip()
-            if remaining_text:
-                normal_text_parts.append(remaining_text)
-        # Combine all normal text parts
-        final_normal_text = " ".join(part for part in normal_text_parts if part).strip()
-        return StreamingParseResult(normal_text=final_normal_text, calls=calls)
+        normal_text = " ".join(normal_parts).strip()
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
     def parse_streaming_increment(
         self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
         """Parse incremental streaming text for TypeScript-style function calls."""
         self._buffer += new_text
-        current_text = self._buffer
-        # Check if we have a tool call
-        has_tool_call = "<|channel|>commentary to=" in current_text
-        if not has_tool_call and current_text:
-            # Check for commentary without function calls
-            commentary_match = self.commentary_pattern.search(current_text)
-            if commentary_match:
-                commentary_content = commentary_match.group(1)
-                self._buffer = current_text[commentary_match.end() :]
-                return StreamingParseResult(normal_text=commentary_content, calls=[])
-            # Check for final channel content
-            final_pattern = re.compile(
-                r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
-                re.DOTALL,
+        # Always use HarmonyParser for parsing to ensure proper filtering
+        events = self.harmony_parser.parse(new_text)
+        # Quick check if we might have tool calls
+        if (
+            "<|channel|>commentary to=" not in self._buffer
+            and not self.current_tool_name_sent
+        ):
+            # No tool calls detected, check for final content
+            if (
+                "<|channel|>final" in self._buffer
+                or "assistantfinal" in self._buffer.lower()
+            ):
+                # Extract normal text from events
+                normal_text = "".join(
+                    [e.content for e in events if e.event_type == "normal"]
+                )
+                if normal_text:
+                    self._buffer = ""
+                    return StreamingParseResult(normal_text=normal_text, calls=[])
+            # For other content, extract normal text from events (with filtering applied)
+            normal_text = "".join(
+                [e.content for e in events if e.event_type == "normal"]
             )
-            final_match = final_pattern.search(current_text)
-            if final_match:
-                final_content = final_match.group(1).strip()
+            if normal_text or events:
                 self._buffer = ""
-                return StreamingParseResult(normal_text=final_content, calls=[])
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+            else:
+                # No events processed, continue buffering
+                return StreamingParseResult(normal_text="", calls=[])
-            self._buffer = ""
-            return StreamingParseResult(normal_text=new_text, calls=[])
+        if not events:
+            # No complete events yet
+            return StreamingParseResult(normal_text="", calls=[])
+        # Initialize state if needed
         if not hasattr(self, "_tool_indices"):
             self._tool_indices = self._get_tool_indices(tools)
         calls = []
-        try:
-            # Check for streaming function call
-            match = self.streaming_pattern.search(current_text)
-            if match:
-                full_function_name = match.group(1)
-                args_content = match.group(2)
-                function_name = (
-                    full_function_name.split(".")[-1]
-                    if "." in full_function_name
-                    else full_function_name
+        normal_text = ""
+        for event in events:
+            if event.event_type == "tool_call":
+                # We got a complete tool call from HarmonyParser
+                tool_call_info = self._extract_tool_call_from_event(
+                    event.raw_text if event.raw_text else event.content,
+                    self._tool_indices,
+                    self.current_tool_id if self.current_tool_id >= 0 else 0,
                 )
-                # Initialize state if this is the first tool call
-                if self.current_tool_id == -1:
-                    self.current_tool_id = 0
-                    self.prev_tool_call_arr = []
-                    self.streamed_args_for_tool = [""]
-                # Ensure we have enough entries in tracking arrays
-                while len(self.prev_tool_call_arr) <= self.current_tool_id:
-                    self.prev_tool_call_arr.append({})
-                while len(self.streamed_args_for_tool) <= self.current_tool_id:
-                    self.streamed_args_for_tool.append("")
-                if not self.current_tool_name_sent:
-                    calls.append(
-                        ToolCallItem(
-                            tool_index=self.current_tool_id,
-                            name=function_name,
-                            parameters="",
-                        )
-                    )
-                    self.current_tool_name_sent = True
-                    # Store the tool call info
+                if tool_call_info:
+                    # Initialize state if first tool
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                        self.prev_tool_call_arr = []
+                        self.streamed_args_for_tool = [""]
+                    # Ensure arrays are large enough
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                        self.streamed_args_for_tool.append("")
+                    # Store tool call info
                     self.prev_tool_call_arr[self.current_tool_id] = {
-                        "name": function_name,
-                        "arguments": {},
+                        "name": tool_call_info.name,
+                        "arguments": json.loads(tool_call_info.parameters),
                     }
-                    self.streamed_args_for_tool[self.current_tool_id] = ""
-                # Check if we have a complete function call
-                complete_match = self.function_call_pattern.search(current_text)
-                if complete_match:
-                    args_content = complete_match.group(2)
-                    try:
-                        parsed_args = json.loads(args_content)
-                        self.prev_tool_call_arr[self.current_tool_id][
-                            "arguments"
-                        ] = parsed_args
-                        # Send complete arguments if we haven't sent them yet
-                        if not self.streamed_args_for_tool[self.current_tool_id]:
-                            # Send the complete arguments as JSON string
-                            calls.append(
-                                ToolCallItem(
-                                    tool_index=self.current_tool_id,
-                                    name=None,
-                                    parameters=json.dumps(
-                                        parsed_args, ensure_ascii=False
-                                    ),
-                                )
-                            )
-                            self.streamed_args_for_tool[self.current_tool_id] = (
-                                json.dumps(parsed_args, ensure_ascii=False)
-                            )
-                    except json.JSONDecodeError:
-                        pass
-                    # Remove the completed function call from buffer
-                    remaining_after_call = current_text[complete_match.end() :]
-                    # Clean up <|start|>assistant prefixes and extract final content
-                    remaining_after_call = re.sub(
-                        r"<\|start\|>assistant(?!\w)", "", remaining_after_call
-                    )
-                    # Extract content from final channel if present
-                    final_pattern = re.compile(
-                        r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
-                        re.DOTALL,
+                    # Emit the complete tool call at once
+                    # (Could be modified to emit name first, then args, if needed)
+                    calls.append(tool_call_info)
+                    # Mark as streamed
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        tool_call_info.parameters
                     )
-                    final_match = final_pattern.search(remaining_after_call)
-                    if final_match:
-                        before_final = remaining_after_call[
-                            : final_match.start()
-                        ].strip()
-                        final_content = final_match.group(1).strip()
+                    # Move to next tool
+                    self.current_tool_id += 1
+                    self.current_tool_name_sent = False
+            elif event.event_type == "normal":
+                normal_text += event.content
-                        parts = []
-                        if before_final:
-                            parts.append(before_final)
-                        if final_content:
-                            parts.append(final_content)
-                        remaining_after_call = " ".join(parts) if parts else ""
+        # Clear buffer since HarmonyParser handles buffering
+        self._buffer = ""
-                    self._buffer = remaining_after_call.strip()
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
-                    # Reset state for next tool call
-                    self.current_tool_name_sent = False
-                    self.current_tool_id += 1
+    def _extract_tool_call_from_event(
+        self, content: str, tool_indices: dict, tool_index: int
+    ) -> Optional[ToolCallItem]:
+        """
+        Extract tool call information from HarmonyParser event content.
-                    # Return final content if available
-                    final_text = ""
-                    if final_match and final_content:
-                        final_text = final_content
-                    elif remaining_after_call:
-                        final_text = remaining_after_call
+        Content format: "commentary to=functions.get_weather<|constrain|>json<|message|>{...}"
+        """
+        match = self.tool_extract_pattern.search(content)
-                    return StreamingParseResult(normal_text=final_text, calls=calls)
+        if not match:
+            logger.debug(f"Could not extract tool call from: {content[:100]}")
+            return None
-            return StreamingParseResult(normal_text="", calls=calls)
+        full_function_name = match.group(1)
+        json_content = match.group(2)
-        except Exception as e:
-            logger.error(f"Error in parse_streaming_increment: {e}")
-            return StreamingParseResult(normal_text=current_text, calls=[])
+        # Extract function name (last part after .)
+        function_name = (
+            full_function_name.split(".")[-1]
+            if "." in full_function_name
+            else full_function_name
+        )
+        # Check if tool exists
+        if function_name not in tool_indices:
+            logger.debug(f"Function {function_name} not in available tools")
+            return None
+        # Parse JSON arguments
+        try:
+            arguments = json.loads(json_content) if json_content.strip() else {}
+        except json.JSONDecodeError as e:
+            logger.debug(f"Failed to parse JSON arguments: {e}")
+            return None
+        return ToolCallItem(
+            tool_index=tool_index,
+            name=function_name,
+            parameters=json.dumps(arguments, ensure_ascii=False),
+        )
     def structure_info(self) -> _GetInfoFunc:
-        raise NotImplementedError()
+        raise NotImplementedError("structure_info not used with HarmonyParser")
     def build_ebnf(self, tools: List[Tool]) -> str:
-        raise NotImplementedError()
+        raise NotImplementedError("build_ebnf not used with HarmonyParser")

sglang/srt/function_call/qwen3_coder_detector.py CHANGED Viewed

@@ -358,5 +358,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
             function_format="xml",
             call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
             key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
-            key_value_separator="\\n",
+            key_value_separator='"\\n"',
         )

sglang/srt/hf_transformers_utils.py CHANGED Viewed

@@ -40,7 +40,9 @@ from sglang.srt.configs import (
     DeepseekVL2Config,
     ExaoneConfig,
     KimiVLConfig,
+    LongcatFlashConfig,
     MultiModalityConfig,
+    Qwen3NextConfig,
     Step3VLConfig,
 )
 from sglang.srt.configs.internvl import InternVLChatConfig
@@ -56,6 +58,8 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     KimiVLConfig.model_type: KimiVLConfig,
     InternVLChatConfig.model_type: InternVLChatConfig,
     Step3VLConfig.model_type: Step3VLConfig,
+    LongcatFlashConfig.model_type: LongcatFlashConfig,
+    Qwen3NextConfig.model_type: Qwen3NextConfig,
 }
 for name, cls in _CONFIG_REGISTRY.items():
@@ -126,6 +130,14 @@ def get_config(
         kwargs["gguf_file"] = model
         model = Path(model).parent
+    if is_remote_url(model):
+        # BaseConnector implements __del__() to clean up the local dir.
+        # Since config files need to exist all the time, so we DO NOT use
+        # with statement to avoid closing the client.
+        client = create_remote_connector(model)
+        client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
+        model = client.get_local_dir()
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
@@ -368,13 +380,22 @@ def get_processor(
     if config.model_type not in {"llava", "clip"}:
         kwargs["use_fast"] = use_fast
     try:
-        processor = AutoProcessor.from_pretrained(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            revision=revision,
-            **kwargs,
-        )
+        if "InternVL3_5" in tokenizer_name:
+            processor = AutoTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
+        else:
+            processor = AutoProcessor.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
     except ValueError as e:
         error_message = str(e)

sglang/srt/layers/activation.py CHANGED Viewed

@@ -35,6 +35,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     is_npu,
+    is_xpu,
     set_weight_attrs,
 )
 from sglang.utils import resolve_obj_by_qualname
@@ -44,8 +45,9 @@ _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
 _is_hip = is_hip()
+_is_xpu = is_xpu()
-if _is_cuda:
+if _is_cuda or _is_xpu:
     from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
 elif _is_hip:
     from sgl_kernel import gelu_and_mul, gelu_quick, gelu_tanh_and_mul, silu_and_mul
@@ -70,8 +72,6 @@ class SiluAndMul(CustomOp):
     def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
         if _is_cpu_amx_available:
-            d = x.shape[-1] // 2
-            output_shape = x.shape[:-1] + (d,)
             out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
             return out
         else:
@@ -81,17 +81,20 @@ class SiluAndMul(CustomOp):
         out = torch_npu.npu_swiglu(x)
         return out
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
 class GeluAndMul(CustomOp):
     def __init__(self, approximate="tanh"):
         super().__init__()
         self.approximate = approximate
-    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1] // 2
-        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
-    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+    def _forward_impl(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
         output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -103,6 +106,33 @@ class GeluAndMul(CustomOp):
             raise RuntimeError("GeluAndMul only support tanh or none")
         return out
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
+    def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
+        if _is_cpu_amx_available and self.approximate == "tanh":
+            return torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(x)
+        elif _is_cpu_amx_available and self.approximate == "none":
+            return torch.ops.sgl_kernel.gelu_and_mul_cpu(x)
+        else:
+            return self.forward_native(x)
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self._forward_impl(x)
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        return self._forward_impl(x)
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        y_npu, gelu_npu = torch_npu.npu_geglu(
+            x,
+            dim=-1,
+            approximate=1 if self.approximate == "tanh" else 0,
+            activate_left=True,
+        )
+        return y_npu
 class NewGELU(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -137,6 +167,9 @@ class QuickGELU(CustomOp):
         gelu_quick(x, out)
         return out
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        return torch_npu.npu_fast_gelu(x)
 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.
@@ -230,7 +263,9 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
         return nn.Identity()
-if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip):
+if not (
+    _is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip or _is_xpu
+):
     logger.info(
         "sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries."
     )

sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl