PyPI - sglang - Versions diffs - 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl - Mend

sglang 0.5.3rc2-py3-none-any.whl → 0.5.4.post1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (419)

sglang/bench_one_batch.py +47 -28
sglang/bench_one_batch_server.py +41 -25
sglang/bench_serving.py +378 -160
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +10 -15
sglang/profiler.py +18 -1
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +13 -64
sglang/srt/configs/load_config.py +25 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +136 -25
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +0 -10
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +20 -11
sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +4 -2
sglang/srt/disaggregation/decode.py +123 -31
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +157 -19
sglang/srt/disaggregation/nixl/conn.py +69 -24
sglang/srt/disaggregation/prefill.py +96 -270
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +63 -19
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +83 -80
sglang/srt/entrypoints/grpc_server.py +430 -234
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +195 -102
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +225 -37
sglang/srt/entrypoints/openai/serving_base.py +49 -2
sglang/srt/entrypoints/openai/serving_chat.py +29 -74
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +15 -1
sglang/srt/entrypoints/openai/serving_responses.py +5 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +58 -6
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +33 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +20 -14
sglang/srt/function_call/glm4_moe_detector.py +1 -5
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/json_array_parser.py +0 -2
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/function_call/utils.py +2 -2
sglang/srt/grpc/compile_proto.py +3 -3
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
sglang/srt/layers/activation.py +10 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/ascend_backend.py +17 -1
sglang/srt/layers/attention/attention_registry.py +43 -23
sglang/srt/layers/attention/base_attn_backend.py +20 -1
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +24 -10
sglang/srt/layers/attention/flashinfer_backend.py +258 -22
sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
sglang/srt/layers/attention/mamba/mamba.py +189 -241
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +0 -1
sglang/srt/layers/attention/nsa_backend.py +404 -90
sglang/srt/layers/attention/triton_backend.py +208 -34
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
sglang/srt/layers/attention/utils.py +89 -7
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +12 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +17 -0
sglang/srt/layers/layernorm.py +64 -19
sglang/srt/layers/linear.py +9 -1
sglang/srt/layers/logits_processor.py +152 -17
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
sglang/srt/layers/moe/ep_moe/layer.py +154 -625
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
sglang/srt/layers/moe/moe_runner/runner.py +6 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +7 -6
sglang/srt/layers/moe/utils.py +20 -5
sglang/srt/layers/quantization/__init__.py +5 -58
sglang/srt/layers/quantization/awq.py +183 -9
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +27 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +152 -81
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +42 -14
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +125 -100
sglang/srt/layers/quantization/mxfp4.py +35 -68
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +23 -48
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +87 -20
sglang/srt/layers/quantization/w8a8_int8.py +30 -24
sglang/srt/layers/radix_attention.py +62 -9
sglang/srt/layers/rotary_embedding.py +686 -17
sglang/srt/layers/sampler.py +47 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +0 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/triton_backend.py +0 -1
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +24 -9
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +48 -17
sglang/srt/managers/data_parallel_controller.py +146 -42
sglang/srt/managers/detokenizer_manager.py +40 -13
sglang/srt/managers/io_struct.py +69 -16
sglang/srt/managers/mm_utils.py +20 -18
sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
sglang/srt/managers/overlap_utils.py +96 -19
sglang/srt/managers/schedule_batch.py +241 -511
sglang/srt/managers/schedule_policy.py +15 -2
sglang/srt/managers/scheduler.py +420 -514
sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
sglang/srt/managers/tokenizer_manager.py +375 -95
sglang/srt/managers/tp_worker.py +212 -161
sglang/srt/managers/utils.py +78 -2
sglang/srt/mem_cache/allocator.py +7 -2
sglang/srt/mem_cache/allocator_ascend.py +2 -2
sglang/srt/mem_cache/base_prefix_cache.py +2 -2
sglang/srt/mem_cache/chunk_cache.py +13 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +16 -1
sglang/srt/mem_cache/hicache_storage.py +11 -2
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +517 -219
sglang/srt/mem_cache/memory_pool_host.py +0 -1
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +53 -19
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +92 -26
sglang/srt/metrics/collector.py +31 -0
sglang/srt/metrics/func_timer.py +1 -1
sglang/srt/model_executor/cuda_graph_runner.py +43 -5
sglang/srt/model_executor/forward_batch_info.py +71 -25
sglang/srt/model_executor/model_runner.py +362 -270
sglang/srt/model_executor/npu_graph_runner.py +2 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +424 -27
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +47 -28
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +13 -52
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +19 -3
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +418 -140
sglang/srt/models/dots_ocr.py +0 -2
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +13 -19
sglang/srt/models/gemma3_mm.py +16 -0
sglang/srt/models/gemma3n_mm.py +1 -2
sglang/srt/models/glm4_moe.py +327 -382
sglang/srt/models/glm4_moe_nextn.py +6 -16
sglang/srt/models/glm4v.py +2 -1
sglang/srt/models/glm4v_moe.py +32 -199
sglang/srt/models/gpt_oss.py +5 -5
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +5 -22
sglang/srt/models/longcat_flash_nextn.py +3 -14
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +13 -3
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +22 -1
sglang/srt/models/qwen2_5_vl.py +3 -3
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +15 -12
sglang/srt/models/qwen2_vl.py +5 -2
sglang/srt/models/qwen3.py +34 -4
sglang/srt/models/qwen3_moe.py +19 -37
sglang/srt/models/qwen3_next.py +7 -12
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +37 -33
sglang/srt/models/qwen3_vl_moe.py +57 -185
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +0 -1
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/utils.py +11 -1
sglang/srt/multimodal/processors/base_processor.py +7 -2
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +0 -1
sglang/srt/multimodal/processors/glm4v.py +2 -6
sglang/srt/multimodal/processors/internvl.py +0 -2
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +75 -16
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/reasoning_parser.py +28 -2
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +17 -22
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +846 -163
sglang/srt/server_args_config_parser.py +1 -1
sglang/srt/single_batch_overlap.py +36 -31
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
sglang/srt/speculative/eagle_info.py +57 -18
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +83 -280
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
sglang/srt/speculative/ngram_worker.py +12 -11
sglang/srt/speculative/spec_info.py +2 -0
sglang/srt/speculative/spec_utils.py +38 -3
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +28 -14
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +272 -82
sglang/srt/utils/hf_transformers_utils.py +44 -17
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +41 -0
sglang/test/runners.py +2 -0
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/test_block_fp8.py +1 -2
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +463 -107
sglang/test/test_deterministic_utils.py +74 -0
sglang/test/test_disaggregation_utils.py +81 -0
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_utils.py +85 -20
sglang/version.py +1 -1
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
sglang/srt/models/vila.py +0 -306
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0

sglang/srt/eplb/expert_distribution.py CHANGED Viewed

@@ -16,21 +16,20 @@ from __future__ import annotations
 import logging
 import math
-import os
 import time
 from abc import ABC
 from collections import deque
 from contextlib import contextmanager
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
 import einops
 import torch
 import torch.distributed
+from sglang.srt.environ import envs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import Withable, get_bool_env_var, is_npu
+from sglang.srt.utils import Withable, is_npu
 _is_npu = is_npu()
@@ -416,10 +415,19 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
     def collect(self) -> Dict:
         num_tokens = len(self._metadata["input_ids"])
+        global_physical_count = _convert_per_token_to_global_physical_count(
+            num_tokens,
+            num_layers=self._expert_location_metadata.num_layers,
+            num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            _topk_ids_of_layer=self._topk_ids_of_layer,
+        )
         return dict(
             **self._metadata,
             topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
             misc_objects=self._misc_objects,
+            global_physical_count=global_physical_count,
         )
@@ -548,6 +556,27 @@ class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
         self._data[layer_idx, :] += local_physical_count_of_layer
+def _convert_per_token_to_global_physical_count(
+    num_tokens: int,
+    num_layers: int,
+    num_physical_experts: int,
+    _topk_ids_of_layer: torch.Tensor,
+) -> torch.Tensor:
+    topk_ids_layer_major = _topk_ids_of_layer[:, :num_tokens, :].reshape(num_layers, -1)
+    mask = topk_ids_layer_major != -1
+    index = topk_ids_layer_major.masked_fill(~mask, 0).long()
+    src = mask.int()
+    ans = torch.zeros(
+        (num_layers, num_physical_experts),
+        dtype=_topk_ids_of_layer.dtype,
+        device=_topk_ids_of_layer.device,
+    )
+    ans.scatter_add_(dim=1, index=index, src=src)
+    return ans
 def _convert_local_to_global_physical_count(
     local_physical_count: torch.Tensor,
     rank: int,
@@ -839,7 +868,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
 def _dump_to_file(name, data):
-    save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
+    save_dir = envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get()
     path_output = save_dir / name
     logger.info(f"Write expert distribution to {path_output}")
     if not save_dir.exists():

sglang/srt/eplb/expert_location_dispatch.py CHANGED Viewed

@@ -18,7 +18,7 @@ from typing import Literal, Optional
 import torch
 from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.server_args import get_global_server_args
 @dataclass
@@ -34,7 +34,7 @@ class ExpertLocationDispatchInfo:
     @classmethod
     def init_new(cls, layer_id: int):
-        ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
+        ep_dispatch_algorithm = get_global_server_args().ep_dispatch_algorithm
         expert_location_metadata = get_global_expert_location_metadata()
         assert expert_location_metadata is not None

sglang/srt/eplb/expert_location_updater.py CHANGED Viewed

@@ -24,7 +24,7 @@ from sglang.srt.eplb.expert_location import (
     ExpertLocationMetadata,
     get_global_expert_location_metadata,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import get_bool_env_var
 logger = logging.getLogger(__name__)
@@ -97,7 +97,7 @@ def _update_expert_weights_with_canary(
         canary_tensor = (
             _get_canary_value(old_expert_location_metadata, layer_id)
             .clone()
-            .to(device=global_server_args_dict["device"], non_blocking=True)
+            .to(device=get_global_server_args().device, non_blocking=True)
         )
         routed_experts_weights_of_layer[layer_id].append(canary_tensor)

sglang/srt/function_call/base_format_detector.py CHANGED Viewed

@@ -3,6 +3,7 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List
+import orjson
 from partial_json_parser.core.exceptions import MalformedJSON
 from partial_json_parser.core.options import Allow
@@ -96,7 +97,7 @@ class BaseFormatDetector(ABC):
         Parses the text in one go. Returns success=True if the format matches, otherwise False.
         Note that leftover_text here represents "content that this parser will not consume further".
         """
-        action = json.loads(text)
+        action = orjson.loads(text)
         return StreamingParseResult(calls=self.parse_base_json(action, tools))
     def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
@@ -264,12 +265,6 @@ class BaseFormatDetector(ABC):
                         # Only remove the processed portion, keep unprocessed content
                         self._buffer = current_text[start_idx + end_idx :]
-                        if self.current_tool_id < len(self.prev_tool_call_arr):
-                            self.prev_tool_call_arr[self.current_tool_id].clear()
-                        self.current_tool_name_sent = False
-                        self.streamed_args_for_tool[self.current_tool_id] = ""
-                        self.current_tool_id += 1
                     # If the tool is still being parsed, send incremental changes
                     elif prev_arguments:
                         prev_args_json = json.dumps(prev_arguments)
@@ -277,6 +272,20 @@ class BaseFormatDetector(ABC):
                             prefix = _find_common_prefix(prev_args_json, cur_args_json)
                             argument_diff = prefix[sent:]
+                    # Update prev_tool_call_arr with current state
+                    if self.current_tool_id >= 0:
+                        # Ensure prev_tool_call_arr is large enough
+                        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                            self.prev_tool_call_arr.append({})
+                        self.prev_tool_call_arr[self.current_tool_id] = (
+                            current_tool_call
+                        )
+                    # Advance to next tool if complete
+                    if is_current_complete:
+                        self.current_tool_name_sent = False
+                        self.current_tool_id += 1
                     # Send the argument diff if there's something new
                     if argument_diff is not None:
                         # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
@@ -293,17 +302,7 @@ class BaseFormatDetector(ABC):
                                 )
                             ],
                         )
-                        if not is_current_complete:
-                            self.streamed_args_for_tool[
-                                self.current_tool_id
-                            ] += argument_diff
-            # Update prev_tool_call_arr with current state
-            if self.current_tool_id >= 0:
-                # Ensure prev_tool_call_arr is large enough
-                while len(self.prev_tool_call_arr) <= self.current_tool_id:
-                    self.prev_tool_call_arr.append({})
-                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
+                        self.streamed_args_for_tool[tool_index_to_use] += argument_diff
             return res

sglang/srt/function_call/function_call_parser.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import logging
-from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union
+from typing import Dict, List, Literal, Optional, Set, Tuple, Type, Union
 from sglang.srt.entrypoints.openai.protocol import (
-    StructuralTagResponseFormat,
+    LegacyStructuralTagResponseFormat,
     StructuresResponseFormat,
     Tool,
+    ToolCallConstraint,
     ToolChoice,
 )
 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -15,6 +16,7 @@ from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
 from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
+from sglang.srt.function_call.minimax_m2 import MinimaxM2Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
 from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
@@ -35,21 +37,23 @@ class FunctionCallParser:
     """
     ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
-        "llama3": Llama32Detector,
-        "qwen25": Qwen25Detector,
-        "mistral": MistralDetector,
         "deepseekv3": DeepSeekV3Detector,
         "deepseekv31": DeepSeekV31Detector,
-        "pythonic": PythonicDetector,
+        "glm": Glm4MoeDetector,
+        "glm45": Glm4MoeDetector,
+        "gpt-oss": GptOssDetector,
         "kimi_k2": KimiK2Detector,
+        "llama3": Llama32Detector,
+        "mistral": MistralDetector,
+        "pythonic": PythonicDetector,
+        "qwen": Qwen25Detector,
+        "qwen25": Qwen25Detector,
         "qwen3_coder": Qwen3CoderDetector,
-        "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
-        "gpt-oss": GptOssDetector,
+        "minimax-m2": MinimaxM2Detector,
     }
     def __init__(self, tools: List[Tool], tool_call_parser: str):
-        detector: Type[BaseFormatDetector] = None
         detector_class = self.ToolCallParserEnum.get(tool_call_parser)
         if detector_class:
             detector = detector_class()
@@ -121,7 +125,7 @@ class FunctionCallParser:
         return final_normal_text, final_calls
-    def get_structure_tag(self) -> StructuralTagResponseFormat:
+    def get_structure_tag(self) -> LegacyStructuralTagResponseFormat:
         """
         Generate a structural tag response format for all available tools.
@@ -149,7 +153,9 @@ class FunctionCallParser:
             )
             tool_trigger_set.add(info.trigger)
-        return StructuralTagResponseFormat(
+        # TODO(dark): move this into new structural tag format
+        # This requires all grammar backend support the new format
+        return LegacyStructuralTagResponseFormat(
             type="structural_tag",
             structures=tool_structures,
             triggers=list(tool_trigger_set),
@@ -157,7 +163,7 @@ class FunctionCallParser:
     def get_structure_constraint(
         self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
-    ) -> Optional[Tuple[str, Any]]:
+    ) -> Optional[ToolCallConstraint]:
         """
         Returns the appropriate structure constraint for tool calls based on the tool_choice.
         The constraint is used to guide the model's output format.
@@ -176,8 +182,8 @@ class FunctionCallParser:
             and tool_choice == "auto"
             and any(tool.function.strict for tool in self.tools)
         ):
-            strict_tag = self.get_structure_tag()
-            return ("structural_tag", strict_tag)
+            tag = self.get_structure_tag()
+            return ("structural_tag", tag)
         elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
             json_schema = get_json_schema_constraint(self.tools, tool_choice)
             return ("json_schema", json_schema)

sglang/srt/function_call/glm4_moe_detector.py CHANGED Viewed

@@ -6,11 +6,7 @@ from typing import List
 from sglang.srt.entrypoints.openai.protocol import Tool
 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
-from sglang.srt.function_call.core_types import (
-    StreamingParseResult,
-    StructureInfo,
-    _GetInfoFunc,
-)
+from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
 from sglang.srt.function_call.ebnf_composer import EBNFComposer
 logger = logging.getLogger(__name__)

sglang/srt/function_call/gpt_oss_detector.py CHANGED Viewed

@@ -31,7 +31,7 @@ class GptOssDetector(BaseFormatDetector):
         # Pattern to extract function name and JSON from tool_call event content
         self.tool_extract_pattern = re.compile(
-            r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
+            r"to=([a-zA-Z_][a-zA-Z0-9_.-]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
             re.DOTALL,
         )

sglang/srt/function_call/json_array_parser.py CHANGED Viewed

@@ -1,5 +1,3 @@
-import json
-import re
 from typing import List
 from sglang.srt.entrypoints.openai.protocol import Tool

sglang/srt/function_call/minimax_m2.py ADDED Viewed

@@ -0,0 +1,367 @@
+import ast
+import html
+import json
+import logging
+import re
+from typing import Any, Dict, List, Tuple
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+logger = logging.getLogger(__name__)
+def _safe_val(raw: str) -> Any:
+    """Decode a raw parameter value into a Python object.
+    The text is stripped and HTML-unescaped first. It is then decoded as
+    JSON; if that fails it is evaluated as a Python literal; if that fails
+    as well, the cleaned string itself is returned.
+    """
+    text = html.unescape(raw.strip())
+    # Try the decoders from strictest to most permissive; the first one that
+    # succeeds wins.
+    for decode in (json.loads, ast.literal_eval):
+        try:
+            return decode(text)
+        except Exception:
+            continue
+    return text
+class MinimaxM2Detector(BaseFormatDetector):
+    """
+    Detector for MiniMax M2 models.
+    Assumes function call format:
+        <minimax:tool_call>
+        <invoke name="func1">
+        <parameter name="param1">value1</parameter>
+        <parameter name="param2">value2</parameter>
+        </invoke>
+        </minimax:tool_call>
+    ``detect_and_parse`` handles a complete response in one pass, while
+    ``parse_streaming_increment`` consumes the response chunk by chunk and
+    emits the function name followed by incremental JSON argument fragments.
+    """
+    # Opening tag of a single call; group 1 is the function name.
+    _INVOKE_OPEN_REGEX = re.compile(r"<invoke name=\"([^>]+)\">")
+    # One complete parameter block; groups are (name, raw value).
+    _PARAMETER_BLOCK_REGEX = re.compile(
+        r"<parameter name=\"([^>]+)\">(.*?)</parameter>", re.DOTALL
+    )
+    def __init__(self):
+        super().__init__()
+        self.tool_call_start_token: str = "<minimax:tool_call>"
+        self.tool_call_end_token: str = "</minimax:tool_call>"
+        self.tool_call_prefix: str = '<invoke name="'
+        self.tool_call_function_end_token: str = "</invoke>"
+        # Not referenced by the methods of this class; kept as a public attribute.
+        self.tool_call_regex = re.compile(
+            r"<minimax:tool_call>(.*?)</minimax:tool_call>|<minimax:tool_call>(.*?)$",
+            re.DOTALL,
+        )
+        # Used by _parse_block: each alternative captures everything after
+        # the opening `name="`, for a closed or an unterminated element.
+        self.tool_call_function_regex = re.compile(
+            r"<invoke name=\"(.*?)</invoke>|<invoke name=\"(.*)$", re.DOTALL
+        )
+        self.tool_call_parameter_regex = re.compile(
+            r"<parameter name=\"(.*?)</parameter>|<parameter name=\"(.*?)$", re.DOTALL
+        )
+        # Text received by parse_streaming_increment but not yet consumed
+        self._buf: str = ""
+        # Streaming state variables
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._streamed_parameters: Dict[str, str] = (
+            {}
+        )  # Track what parameter content we've streamed
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+    def has_tool_call(self, text: str) -> bool:
+        """Return True if ``text`` contains the tool call start token."""
+        return self.tool_call_start_token in text
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse a complete response into normal text and tool calls."""
+        normal, calls = self._extract(text, tools)
+        return StreamingParseResult(normal_text=normal, calls=calls)
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """Consume one streamed chunk and return what can be emitted so far.
+        Text outside a tool call is returned as normal text. Inside a tool
+        call the function name is emitted first (with empty parameters), then
+        the arguments as JSON fragments: one fragment per newly completed
+        parameter and a final fragment that closes the JSON object when
+        ``</invoke>`` is seen. Unconsumed text stays in ``self._buf`` until
+        later chunks complete it.
+        Args:
+            new_text: The newly generated text to append to the buffer.
+            tools: Available tools; used to validate function names.
+        Returns:
+            StreamingParseResult with the normal text and the tool call items
+            produced by this chunk (either may be empty).
+        """
+        self._buf += new_text
+        normal = ""
+        calls: List[ToolCallItem] = []
+        # Build tool indices for validation (cached on the first call)
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+        while True:
+            # Outside a tool call: everything up to the start token is normal text
+            if not self._in_tool_call:
+                start = self._buf.find(self.tool_call_start_token)
+                if start == -1:
+                    normal += self._buf
+                    self._buf = ""
+                    break
+                normal += self._buf[:start]
+                # Drop the start token together with the text before it
+                self._buf = self._buf[start + len(self.tool_call_start_token) :]
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                self._streamed_parameters = {}
+                continue
+            # We're in a tool call, try to parse function name if not sent yet
+            if not self._function_name_sent:
+                # Look for function name pattern: <invoke name="name">
+                function_match = self._INVOKE_OPEN_REGEX.search(self._buf)
+                if not function_match:
+                    # Function name not complete yet, wait for more text
+                    break
+                function_name = function_match.group(1).strip()
+                if function_name not in self._tool_indices:
+                    # Invalid function name: leave the tool call and hand the
+                    # buffered text back as normal text
+                    logger.warning("Invalid function name: %s", function_name)
+                    self._reset_streaming_state()
+                    normal += self._buf
+                    self._buf = ""
+                    break
+                self._current_function_name = function_name
+                self._function_name_sent = True
+                # Initialize tool call tracking
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                # Ensure tracking arrays are large enough
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+                # Store tool call info
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": function_name,
+                    "arguments": {},
+                }
+                # Send tool name with empty parameters
+                calls.append(
+                    ToolCallItem(
+                        tool_index=self.current_tool_id,
+                        name=function_name,
+                        parameters="",
+                    )
+                )
+                # Remove the processed function declaration
+                self._buf = self._buf[function_match.end() :]
+                continue
+            # Function name already sent: parse parameters incrementally.
+            # Only the body of the current <invoke> is scanned, so parameters
+            # of a following call that arrived in the same chunk are not
+            # attributed to this one.
+            end_pos = self._find_invoke_end(self._buf)
+            invoke_body = self._buf if end_pos == -1 else self._buf[:end_pos]
+            calls.extend(self._parse_and_stream_parameters(invoke_body))
+            if end_pos == -1:
+                # Tool call not complete yet, wait for more text
+                break
+            # The streamed fragment is never closed by the parameter parser,
+            # so close it unconditionally. Counting braces would miscount
+            # braces that appear inside string values. A call without any
+            # parameter gets an empty JSON object.
+            streamed = self.streamed_args_for_tool[self.current_tool_id]
+            closing = "}" if streamed else "{}"
+            calls.append(
+                ToolCallItem(
+                    tool_index=self.current_tool_id,
+                    name=None,
+                    parameters=closing,
+                )
+            )
+            self.streamed_args_for_tool[self.current_tool_id] = streamed + closing
+            # Complete the tool call
+            self._buf = self._buf[end_pos + len(self.tool_call_function_end_token) :]
+            self._reset_streaming_state(True)
+            self.current_tool_id += 1
+        return StreamingParseResult(normal_text=normal, calls=calls)
+    def _find_invoke_end(self, text: str) -> int:
+        """Return the index of the ``</invoke>`` closing the current call.
+        Occurrences of the end token inside a complete parameter block are
+        skipped, and the search stops at the first end token that precedes a
+        later parameter block. Returns -1 if the call is not closed yet.
+        """
+        end_token = self.tool_call_function_end_token
+        scan_from = 0
+        for match in self._PARAMETER_BLOCK_REGEX.finditer(text):
+            end_pos = text.find(end_token, scan_from, match.start())
+            if end_pos != -1:
+                # This parameter block already belongs to a later call
+                return end_pos
+            scan_from = match.end()
+        return text.find(end_token, scan_from)
+    @staticmethod
+    def _format_json_items(params: Dict[str, Any], keys) -> str:
+        """Render the given keys of ``params`` as comma-separated JSON members."""
+        return ", ".join(
+            f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(params[key], ensure_ascii=False)}"
+            for key in keys
+        )
+    def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
+        """
+        Parse complete parameter blocks from text and return any tool call items to emit.
+        This method:
+        1. Finds all complete <parameter> blocks
+        2. Parses them into a dictionary
+        3. Compares with current parameters and generates diff if needed
+        4. Updates internal state
+        The emitted fragments never contain the closing brace of the JSON
+        object; the caller appends it once the call is complete.
+        Args:
+            text_to_parse: The text to search for parameter blocks
+        Returns:
+            List of ToolCallItem objects to emit (may be empty)
+        """
+        calls: List[ToolCallItem] = []
+        # Build new parameters dictionary from all complete parameter blocks
+        new_params: Dict[str, Any] = {}
+        for match in self._PARAMETER_BLOCK_REGEX.finditer(text_to_parse):
+            new_params[match.group(1).strip()] = _safe_val(match.group(2))
+        if new_params == self._current_parameters:
+            return calls
+        previous_args_json = self.streamed_args_for_tool[self.current_tool_id]
+        if not self._current_parameters:
+            # First parameter(s) - start JSON object but don't close it yet
+            json_fragment = "{" + self._format_json_items(new_params, new_params)
+            updated_args_json = json_fragment
+        else:
+            # Additional parameters - add them incrementally, in the order
+            # they appear in the text (a set would make the order arbitrary)
+            new_keys = [
+                key for key in new_params if key not in self._current_parameters
+            ]
+            json_fragment = (
+                ", " + self._format_json_items(new_params, new_keys)
+                if new_keys
+                else ""
+            )
+            updated_args_json = previous_args_json + json_fragment
+        if json_fragment:
+            calls.append(
+                ToolCallItem(
+                    tool_index=self.current_tool_id,
+                    name=None,
+                    parameters=json_fragment,
+                )
+            )
+            self.streamed_args_for_tool[self.current_tool_id] = updated_args_json
+        # Update current state
+        self._current_parameters = new_params
+        self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+        return calls
+    def _reset_streaming_state(self, still_in_tool_call: bool = False):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = still_in_tool_call
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+        self._streamed_parameters = {}
+        self.current_tool_name_sent = False
+    def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
+        """Split a complete response into normal text and parsed tool calls.
+        Every ``<minimax:tool_call>...</minimax:tool_call>`` block is parsed
+        with ``_parse_block``; the text around the blocks is concatenated and
+        returned as normal text. A start token without a matching end token
+        is left in the normal text from the start token onwards.
+        """
+        normal_parts: List[str] = []
+        calls: List[ToolCallItem] = []
+        cursor = 0
+        while True:
+            s = text.find(self.tool_call_start_token, cursor)
+            if s == -1:
+                normal_parts.append(text[cursor:])
+                break
+            normal_parts.append(text[cursor:s])
+            e = text.find(self.tool_call_end_token, s)
+            if e == -1:
+                normal_parts.append(text[s:])
+                break
+            block = text[s : e + len(self.tool_call_end_token)]
+            cursor = e + len(self.tool_call_end_token)
+            calls.extend(self._parse_block(block, tools))
+        return "".join(normal_parts), calls
+    def _parse_block(self, block: str, tools: List[Tool]) -> List[ToolCallItem]:
+        """Parse every ``<invoke>`` element of one tool call block.
+        Elements whose name attribute is not terminated by ``">`` are
+        skipped. A call for which ``parse_base_json`` raises is dropped with
+        a warning.
+        """
+        res: List[ToolCallItem] = []
+        for m in self.tool_call_function_regex.findall(block):
+            # Exactly one of the two alternatives captured text
+            txt = m[0] or m[1]
+            if '">' not in txt:
+                continue
+            idx = txt.index('">')
+            fname = txt[:idx].strip()
+            body = txt[idx + 2 :]
+            params: Dict[str, Any] = {}
+            for pm in self.tool_call_parameter_regex.findall(body):
+                ptxt = pm[0] or pm[1]
+                if '">' not in ptxt:
+                    continue
+                pidx = ptxt.index('">')
+                pname = ptxt[:pidx].strip()
+                # Drop only the newlines that surround the value
+                pval = ptxt[pidx + 2 :].lstrip("\n").rstrip("\n")
+                params[pname] = _safe_val(pval)
+            raw = {"name": fname, "arguments": params}
+            try:
+                # TODO: fix idx in function call, the index for a function
+                # call will always be -1 in parse_base_json
+                res.extend(self.parse_base_json(raw, tools))
+            except Exception:
+                logger.warning("invalid tool call for %s dropped", fname)
+        return res
+    def supports_structural_tag(self) -> bool:
+        """Structural tags are not supported by this detector."""
+        return False
+    def structure_info(self) -> _GetInfoFunc:
+        """Not implemented for this detector."""
+        raise NotImplementedError
+    def build_ebnf(self, tools: List[Tool]):
+        """Build the EBNF grammar for this call format via ``EBNFComposer``."""
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.tool_call_start_token.replace("\n", "\\n"),
+            individual_call_end_token=self.tool_call_end_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
+            function_format="xml",
+            call_rule_fmt='"<invoke name=\\"{name}\\">\\n" {arguments_rule} "\\n</invoke>"',
+            key_value_rule_fmt='"<parameter name=\\"{key}\\">\\n" {valrule} "\\n</parameter>"',
+            key_value_separator='"\\n"',
+        )

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl