sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  import logging
3
3
  import re
4
- from typing import List
4
+ from typing import List, Optional
5
5
 
6
6
  from sglang.srt.entrypoints.openai.protocol import Tool
7
7
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -10,60 +10,31 @@ from sglang.srt.function_call.core_types import (
10
10
  ToolCallItem,
11
11
  _GetInfoFunc,
12
12
  )
13
+ from sglang.srt.parser.harmony_parser import HarmonyParser
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
16
17
 
17
18
  class GptOssDetector(BaseFormatDetector):
18
19
  """
19
- Detector for T4-style function calls with channel format.
20
+ Detector for T4-style function calls using HarmonyParser.
20
21
 
21
- Supports two formats:
22
- 1. Direct function call: <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
23
- 2. Commentary with action plan: <|channel|>commentary<|message|>{content}<|end|>
24
-
25
- For parallel function calls, each call is self-contained and starts with its own channel:
26
- <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"SF"}<|call|>
27
- <|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"SF attractions"}<|call|>
28
-
29
- Examples:
30
- Single: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"San Francisco"}<|call|>commentary
31
- Multiple: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"Paris"}<|call|>commentary<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"Paris tourism"}<|call|>
32
- With Action Plan: <|channel|>commentary<|message|>**Action plan**: 1. Do X 2. Do Y<|end|><|start|>assistant<|channel|>commentary to=functions.x<|constrain|>json<|message|>{"template": "basic_html", "path": "index.html"}<|call|>
22
+ Handles tool calls in the format:
23
+ <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
33
24
  """
34
25
 
35
26
  def __init__(self):
36
27
  super().__init__()
28
+ self.harmony_parser = HarmonyParser()
37
29
  self.bot_token = "<|start|>assistant<|channel|>commentary"
38
30
  self.eot_token = "<|call|>"
39
- # TODO: no clear indication how parallel tool call response format is
40
- self.tool_call_separator = ""
41
-
42
- # Pattern for complete function calls with to= parameter
43
- # Handles both <|call|> and <|call|>commentary endings
44
- # Also handles optional <|start|>assistant prefix and whitespace after function name
45
- self.function_call_pattern = re.compile(
46
- r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
47
- r"<\|constrain\|>json<\|message\|>(.*?)<\|call\|>(?:commentary)?",
48
- re.DOTALL,
49
- )
50
-
51
- # Pattern for streaming function calls (incomplete)
52
- # Also handles optional whitespace after function name
53
- self.streaming_pattern = re.compile(
54
- r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
55
- r"<\|constrain\|>json<\|message\|>(.*)",
56
- re.DOTALL,
57
- )
58
31
 
59
- # Pattern for commentary with action plan (no to= parameter)
60
- self.commentary_pattern = re.compile(
61
- r"<\|channel\|>commentary<\|message\|>(.*?)<\|end\|>",
32
+ # Pattern to extract function name and JSON from tool_call event content
33
+ self.tool_extract_pattern = re.compile(
34
+ r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
62
35
  re.DOTALL,
63
36
  )
64
37
 
65
- self._last_arguments = ""
66
-
67
38
  def has_tool_call(self, text: str) -> bool:
68
39
  """Check if text contains TypeScript-style function call markers."""
69
40
  return self.bot_token in text
@@ -73,259 +44,176 @@ class GptOssDetector(BaseFormatDetector):
73
44
  if not self.has_tool_call(text):
74
45
  return StreamingParseResult(normal_text=text, calls=[])
75
46
 
76
- tool_indices = self._get_tool_indices(tools)
47
+ # Parse with HarmonyParser
48
+ events = self.harmony_parser.parse(text)
49
+ # Flush buffer for complete parsing
50
+ events += self.harmony_parser.parse("")
77
51
 
52
+ tool_indices = self._get_tool_indices(tools)
78
53
  calls = []
54
+ normal_parts = []
79
55
  tool_index = 0
80
56
 
81
- # Process the entire text to handle mixed commentary and tool calls
82
- normal_text_parts = []
83
-
84
- # Find all commentary sections (both with and without to=)
85
- all_commentary_pattern = re.compile(
86
- r"<\|channel\|>commentary(?:\s+to=[^<]*)?<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
87
- re.DOTALL,
88
- )
89
-
90
- # Track processed positions to avoid double-processing
91
- processed_ranges = []
92
-
93
- # First, extract all tool calls
94
- for match in self.function_call_pattern.finditer(text):
95
- full_function_name = match.group(1)
96
- args_content = match.group(2)
97
- processed_ranges.append((match.start(), match.end()))
98
-
99
- function_name = (
100
- full_function_name.split(".")[-1]
101
- if "." in full_function_name
102
- else full_function_name
103
- )
104
-
105
- try:
106
- arguments = json.loads(args_content) if args_content.strip() else {}
107
- except json.JSONDecodeError:
108
- continue
109
-
110
- if function_name in tool_indices:
111
- calls.append(
112
- ToolCallItem(
113
- tool_index=tool_index,
114
- name=function_name,
115
- parameters=json.dumps(arguments, ensure_ascii=False),
116
- )
57
+ for event in events:
58
+ if event.event_type == "tool_call":
59
+ # Extract tool call from event content
60
+ tool_call = self._extract_tool_call_from_event(
61
+ event.raw_text if event.raw_text else event.content,
62
+ tool_indices,
63
+ tool_index,
117
64
  )
118
- tool_index += 1
119
-
120
- # Then, find non-tool-call commentary sections for normal text
121
- for match in all_commentary_pattern.finditer(text):
122
- # Check if this match overlaps with any processed tool call
123
- match_start, match_end = match.start(), match.end()
124
- is_tool_call = any(
125
- start <= match_start < end or start < match_end <= end
126
- for start, end in processed_ranges
127
- )
128
-
129
- # If this commentary is not part of a tool call, include it in normal text
130
- if not is_tool_call:
131
- content = match.group(1).strip()
132
- if content:
133
- normal_text_parts.append(content)
134
-
135
- # Handle remaining text after all matches
136
- if processed_ranges:
137
- last_match_end = max(end for _, end in processed_ranges)
138
- if last_match_end < len(text):
139
- remaining_text = text[last_match_end:]
140
-
141
- # Clean up <|start|>assistant prefixes and extract final content
142
- # Remove standalone <|start|>assistant prefixes
143
- remaining_text = re.sub(r"<\|start\|>assistant(?!\w)", "", remaining_text)
144
-
145
- # Extract content from final channel if present
146
- final_pattern = re.compile(
147
- r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", re.DOTALL
148
- )
149
- final_match = final_pattern.search(remaining_text)
150
-
151
- if final_match:
152
- # Get everything before final channel + final channel content
153
- before_final = remaining_text[: final_match.start()].strip()
154
- final_content = final_match.group(1).strip()
65
+ if tool_call:
66
+ calls.append(tool_call)
67
+ tool_index += 1
68
+ elif event.event_type == "normal":
69
+ normal_parts.append(event.content)
70
+ # Ignore reasoning events in function call context
155
71
 
156
- parts = []
157
- if before_final:
158
- parts.append(before_final)
159
- if final_content:
160
- parts.append(final_content)
161
- remaining_text = " ".join(parts) if parts else ""
162
-
163
- remaining_text = remaining_text.strip()
164
-
165
- if remaining_text:
166
- normal_text_parts.append(remaining_text)
167
-
168
- # Combine all normal text parts
169
- final_normal_text = " ".join(part for part in normal_text_parts if part).strip()
170
- return StreamingParseResult(normal_text=final_normal_text, calls=calls)
72
+ normal_text = " ".join(normal_parts).strip()
73
+ return StreamingParseResult(normal_text=normal_text, calls=calls)
171
74
 
172
75
  def parse_streaming_increment(
173
76
  self, new_text: str, tools: List[Tool]
174
77
  ) -> StreamingParseResult:
175
78
  """Parse incremental streaming text for TypeScript-style function calls."""
176
79
  self._buffer += new_text
177
- current_text = self._buffer
178
-
179
- # Check if we have a tool call
180
- has_tool_call = "<|channel|>commentary to=" in current_text
181
-
182
- if not has_tool_call and current_text:
183
- # Check for commentary without function calls
184
- commentary_match = self.commentary_pattern.search(current_text)
185
- if commentary_match:
186
- commentary_content = commentary_match.group(1)
187
- self._buffer = current_text[commentary_match.end() :]
188
- return StreamingParseResult(normal_text=commentary_content, calls=[])
189
-
190
- # Check for final channel content
191
- final_pattern = re.compile(
192
- r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
193
- re.DOTALL,
80
+
81
+ # Always use HarmonyParser for parsing to ensure proper filtering
82
+ events = self.harmony_parser.parse(new_text)
83
+
84
+ # Quick check if we might have tool calls
85
+ if (
86
+ "<|channel|>commentary to=" not in self._buffer
87
+ and not self.current_tool_name_sent
88
+ ):
89
+ # No tool calls detected, check for final content
90
+ if (
91
+ "<|channel|>final" in self._buffer
92
+ or "assistantfinal" in self._buffer.lower()
93
+ ):
94
+ # Extract normal text from events
95
+ normal_text = "".join(
96
+ [e.content for e in events if e.event_type == "normal"]
97
+ )
98
+ if normal_text:
99
+ self._buffer = ""
100
+ return StreamingParseResult(normal_text=normal_text, calls=[])
101
+
102
+ # For other content, extract normal text from events (with filtering applied)
103
+ normal_text = "".join(
104
+ [e.content for e in events if e.event_type == "normal"]
194
105
  )
195
- final_match = final_pattern.search(current_text)
196
- if final_match:
197
- final_content = final_match.group(1).strip()
106
+ if normal_text or events:
198
107
  self._buffer = ""
199
- return StreamingParseResult(normal_text=final_content, calls=[])
108
+ return StreamingParseResult(normal_text=normal_text, calls=[])
109
+ else:
110
+ # No events processed, continue buffering
111
+ return StreamingParseResult(normal_text="", calls=[])
200
112
 
201
- self._buffer = ""
202
- return StreamingParseResult(normal_text=new_text, calls=[])
113
+ if not events:
114
+ # No complete events yet
115
+ return StreamingParseResult(normal_text="", calls=[])
203
116
 
117
+ # Initialize state if needed
204
118
  if not hasattr(self, "_tool_indices"):
205
119
  self._tool_indices = self._get_tool_indices(tools)
206
120
 
207
121
  calls = []
208
- try:
209
- # Check for streaming function call
210
- match = self.streaming_pattern.search(current_text)
211
- if match:
212
- full_function_name = match.group(1)
213
- args_content = match.group(2)
214
-
215
- function_name = (
216
- full_function_name.split(".")[-1]
217
- if "." in full_function_name
218
- else full_function_name
122
+ normal_text = ""
123
+
124
+ for event in events:
125
+ if event.event_type == "tool_call":
126
+ # We got a complete tool call from HarmonyParser
127
+ tool_call_info = self._extract_tool_call_from_event(
128
+ event.raw_text if event.raw_text else event.content,
129
+ self._tool_indices,
130
+ self.current_tool_id if self.current_tool_id >= 0 else 0,
219
131
  )
220
132
 
221
- # Initialize state if this is the first tool call
222
- if self.current_tool_id == -1:
223
- self.current_tool_id = 0
224
- self.prev_tool_call_arr = []
225
- self.streamed_args_for_tool = [""]
226
-
227
- # Ensure we have enough entries in tracking arrays
228
- while len(self.prev_tool_call_arr) <= self.current_tool_id:
229
- self.prev_tool_call_arr.append({})
230
- while len(self.streamed_args_for_tool) <= self.current_tool_id:
231
- self.streamed_args_for_tool.append("")
232
-
233
- if not self.current_tool_name_sent:
234
- calls.append(
235
- ToolCallItem(
236
- tool_index=self.current_tool_id,
237
- name=function_name,
238
- parameters="",
239
- )
240
- )
241
- self.current_tool_name_sent = True
242
- # Store the tool call info
133
+ if tool_call_info:
134
+ # Initialize state if first tool
135
+ if self.current_tool_id == -1:
136
+ self.current_tool_id = 0
137
+ self.prev_tool_call_arr = []
138
+ self.streamed_args_for_tool = [""]
139
+
140
+ # Ensure arrays are large enough
141
+ while len(self.prev_tool_call_arr) <= self.current_tool_id:
142
+ self.prev_tool_call_arr.append({})
143
+ while len(self.streamed_args_for_tool) <= self.current_tool_id:
144
+ self.streamed_args_for_tool.append("")
145
+
146
+ # Store tool call info
243
147
  self.prev_tool_call_arr[self.current_tool_id] = {
244
- "name": function_name,
245
- "arguments": {},
148
+ "name": tool_call_info.name,
149
+ "arguments": json.loads(tool_call_info.parameters),
246
150
  }
247
- self.streamed_args_for_tool[self.current_tool_id] = ""
248
-
249
- # Check if we have a complete function call
250
- complete_match = self.function_call_pattern.search(current_text)
251
- if complete_match:
252
- args_content = complete_match.group(2)
253
-
254
- try:
255
- parsed_args = json.loads(args_content)
256
- self.prev_tool_call_arr[self.current_tool_id][
257
- "arguments"
258
- ] = parsed_args
259
-
260
- # Send complete arguments if we haven't sent them yet
261
- if not self.streamed_args_for_tool[self.current_tool_id]:
262
- # Send the complete arguments as JSON string
263
- calls.append(
264
- ToolCallItem(
265
- tool_index=self.current_tool_id,
266
- name=None,
267
- parameters=json.dumps(
268
- parsed_args, ensure_ascii=False
269
- ),
270
- )
271
- )
272
- self.streamed_args_for_tool[self.current_tool_id] = (
273
- json.dumps(parsed_args, ensure_ascii=False)
274
- )
275
- except json.JSONDecodeError:
276
- pass
277
-
278
- # Remove the completed function call from buffer
279
- remaining_after_call = current_text[complete_match.end() :]
280
-
281
- # Clean up <|start|>assistant prefixes and extract final content
282
- remaining_after_call = re.sub(
283
- r"<\|start\|>assistant(?!\w)", "", remaining_after_call
284
- )
285
151
 
286
- # Extract content from final channel if present
287
- final_pattern = re.compile(
288
- r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
289
- re.DOTALL,
152
+ # Emit the complete tool call at once
153
+ # (Could be modified to emit name first, then args, if needed)
154
+ calls.append(tool_call_info)
155
+
156
+ # Mark as streamed
157
+ self.streamed_args_for_tool[self.current_tool_id] = (
158
+ tool_call_info.parameters
290
159
  )
291
- final_match = final_pattern.search(remaining_after_call)
292
160
 
293
- if final_match:
294
- before_final = remaining_after_call[
295
- : final_match.start()
296
- ].strip()
297
- final_content = final_match.group(1).strip()
161
+ # Move to next tool
162
+ self.current_tool_id += 1
163
+ self.current_tool_name_sent = False
164
+
165
+ elif event.event_type == "normal":
166
+ normal_text += event.content
298
167
 
299
- parts = []
300
- if before_final:
301
- parts.append(before_final)
302
- if final_content:
303
- parts.append(final_content)
304
- remaining_after_call = " ".join(parts) if parts else ""
168
+ # Clear buffer since HarmonyParser handles buffering
169
+ self._buffer = ""
305
170
 
306
- self._buffer = remaining_after_call.strip()
171
+ return StreamingParseResult(normal_text=normal_text, calls=calls)
307
172
 
308
- # Reset state for next tool call
309
- self.current_tool_name_sent = False
310
- self.current_tool_id += 1
173
+ def _extract_tool_call_from_event(
174
+ self, content: str, tool_indices: dict, tool_index: int
175
+ ) -> Optional[ToolCallItem]:
176
+ """
177
+ Extract tool call information from HarmonyParser event content.
311
178
 
312
- # Return final content if available
313
- final_text = ""
314
- if final_match and final_content:
315
- final_text = final_content
316
- elif remaining_after_call:
317
- final_text = remaining_after_call
179
+ Content format: "commentary to=functions.get_weather<|constrain|>json<|message|>{...}"
180
+ """
181
+ match = self.tool_extract_pattern.search(content)
318
182
 
319
- return StreamingParseResult(normal_text=final_text, calls=calls)
183
+ if not match:
184
+ logger.debug(f"Could not extract tool call from: {content[:100]}")
185
+ return None
320
186
 
321
- return StreamingParseResult(normal_text="", calls=calls)
187
+ full_function_name = match.group(1)
188
+ json_content = match.group(2)
322
189
 
323
- except Exception as e:
324
- logger.error(f"Error in parse_streaming_increment: {e}")
325
- return StreamingParseResult(normal_text=current_text, calls=[])
190
+ # Extract function name (last part after .)
191
+ function_name = (
192
+ full_function_name.split(".")[-1]
193
+ if "." in full_function_name
194
+ else full_function_name
195
+ )
196
+
197
+ # Check if tool exists
198
+ if function_name not in tool_indices:
199
+ logger.debug(f"Function {function_name} not in available tools")
200
+ return None
201
+
202
+ # Parse JSON arguments
203
+ try:
204
+ arguments = json.loads(json_content) if json_content.strip() else {}
205
+ except json.JSONDecodeError as e:
206
+ logger.debug(f"Failed to parse JSON arguments: {e}")
207
+ return None
208
+
209
+ return ToolCallItem(
210
+ tool_index=tool_index,
211
+ name=function_name,
212
+ parameters=json.dumps(arguments, ensure_ascii=False),
213
+ )
326
214
 
327
215
  def structure_info(self) -> _GetInfoFunc:
328
- raise NotImplementedError()
216
+ raise NotImplementedError("structure_info not used with HarmonyParser")
329
217
 
330
218
  def build_ebnf(self, tools: List[Tool]) -> str:
331
- raise NotImplementedError()
219
+ raise NotImplementedError("build_ebnf not used with HarmonyParser")
@@ -358,5 +358,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
358
358
  function_format="xml",
359
359
  call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
360
360
  key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
361
- key_value_separator="\\n",
361
+ key_value_separator='"\\n"',
362
362
  )
@@ -40,7 +40,9 @@ from sglang.srt.configs import (
40
40
  DeepseekVL2Config,
41
41
  ExaoneConfig,
42
42
  KimiVLConfig,
43
+ LongcatFlashConfig,
43
44
  MultiModalityConfig,
45
+ Qwen3NextConfig,
44
46
  Step3VLConfig,
45
47
  )
46
48
  from sglang.srt.configs.internvl import InternVLChatConfig
@@ -56,6 +58,8 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
56
58
  KimiVLConfig.model_type: KimiVLConfig,
57
59
  InternVLChatConfig.model_type: InternVLChatConfig,
58
60
  Step3VLConfig.model_type: Step3VLConfig,
61
+ LongcatFlashConfig.model_type: LongcatFlashConfig,
62
+ Qwen3NextConfig.model_type: Qwen3NextConfig,
59
63
  }
60
64
 
61
65
  for name, cls in _CONFIG_REGISTRY.items():
@@ -126,6 +130,14 @@ def get_config(
126
130
  kwargs["gguf_file"] = model
127
131
  model = Path(model).parent
128
132
 
133
+ if is_remote_url(model):
134
+ # BaseConnector implements __del__() to clean up the local dir.
135
+ # Since config files need to exist all the time, so we DO NOT use
136
+ # with statement to avoid closing the client.
137
+ client = create_remote_connector(model)
138
+ client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
139
+ model = client.get_local_dir()
140
+
129
141
  config = AutoConfig.from_pretrained(
130
142
  model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
131
143
  )
@@ -368,13 +380,22 @@ def get_processor(
368
380
  if config.model_type not in {"llava", "clip"}:
369
381
  kwargs["use_fast"] = use_fast
370
382
  try:
371
- processor = AutoProcessor.from_pretrained(
372
- tokenizer_name,
373
- *args,
374
- trust_remote_code=trust_remote_code,
375
- revision=revision,
376
- **kwargs,
377
- )
383
+ if "InternVL3_5" in tokenizer_name:
384
+ processor = AutoTokenizer.from_pretrained(
385
+ tokenizer_name,
386
+ *args,
387
+ trust_remote_code=trust_remote_code,
388
+ revision=revision,
389
+ **kwargs,
390
+ )
391
+ else:
392
+ processor = AutoProcessor.from_pretrained(
393
+ tokenizer_name,
394
+ *args,
395
+ trust_remote_code=trust_remote_code,
396
+ revision=revision,
397
+ **kwargs,
398
+ )
378
399
 
379
400
  except ValueError as e:
380
401
  error_message = str(e)
@@ -35,6 +35,7 @@ from sglang.srt.utils import (
35
35
  is_cuda,
36
36
  is_hip,
37
37
  is_npu,
38
+ is_xpu,
38
39
  set_weight_attrs,
39
40
  )
40
41
  from sglang.utils import resolve_obj_by_qualname
@@ -44,8 +45,9 @@ _is_npu = is_npu()
44
45
  _is_cpu_amx_available = cpu_has_amx_support()
45
46
  _is_cpu = is_cpu()
46
47
  _is_hip = is_hip()
48
+ _is_xpu = is_xpu()
47
49
 
48
- if _is_cuda:
50
+ if _is_cuda or _is_xpu:
49
51
  from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
50
52
  elif _is_hip:
51
53
  from sgl_kernel import gelu_and_mul, gelu_quick, gelu_tanh_and_mul, silu_and_mul
@@ -70,8 +72,6 @@ class SiluAndMul(CustomOp):
70
72
 
71
73
  def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
72
74
  if _is_cpu_amx_available:
73
- d = x.shape[-1] // 2
74
- output_shape = x.shape[:-1] + (d,)
75
75
  out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
76
76
  return out
77
77
  else:
@@ -81,17 +81,20 @@ class SiluAndMul(CustomOp):
81
81
  out = torch_npu.npu_swiglu(x)
82
82
  return out
83
83
 
84
+ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
85
+ d = x.shape[-1] // 2
86
+ output_shape = x.shape[:-1] + (d,)
87
+ out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
88
+ silu_and_mul(x, out)
89
+ return out
90
+
84
91
 
85
92
  class GeluAndMul(CustomOp):
86
93
  def __init__(self, approximate="tanh"):
87
94
  super().__init__()
88
95
  self.approximate = approximate
89
96
 
90
- def forward_native(self, x: torch.Tensor) -> torch.Tensor:
91
- d = x.shape[-1] // 2
92
- return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
93
-
94
- def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
97
+ def _forward_impl(self, x: torch.Tensor) -> torch.Tensor:
95
98
  d = x.shape[-1] // 2
96
99
  output_shape = x.shape[:-1] + (d,)
97
100
  out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -103,6 +106,33 @@ class GeluAndMul(CustomOp):
103
106
  raise RuntimeError("GeluAndMul only support tanh or none")
104
107
  return out
105
108
 
109
+ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
110
+ d = x.shape[-1] // 2
111
+ return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
112
+
113
+ def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
114
+ if _is_cpu_amx_available and self.approximate == "tanh":
115
+ return torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(x)
116
+ elif _is_cpu_amx_available and self.approximate == "none":
117
+ return torch.ops.sgl_kernel.gelu_and_mul_cpu(x)
118
+ else:
119
+ return self.forward_native(x)
120
+
121
+ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
122
+ return self._forward_impl(x)
123
+
124
+ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
125
+ return self._forward_impl(x)
126
+
127
+ def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
128
+ y_npu, gelu_npu = torch_npu.npu_geglu(
129
+ x,
130
+ dim=-1,
131
+ approximate=1 if self.approximate == "tanh" else 0,
132
+ activate_left=True,
133
+ )
134
+ return y_npu
135
+
106
136
 
107
137
  class NewGELU(CustomOp):
108
138
  def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -137,6 +167,9 @@ class QuickGELU(CustomOp):
137
167
  gelu_quick(x, out)
138
168
  return out
139
169
 
170
+ def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
171
+ return torch_npu.npu_fast_gelu(x)
172
+
140
173
 
141
174
  class ScaledActivation(nn.Module):
142
175
  """An activation function with post-scale parameters.
@@ -230,7 +263,9 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
230
263
  return nn.Identity()
231
264
 
232
265
 
233
- if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip):
266
+ if not (
267
+ _is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip or _is_xpu
268
+ ):
234
269
  logger.info(
235
270
  "sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries."
236
271
  )