sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -11,21 +11,26 @@
|
|
11
11
|
# See the License for the specific language governing permissions and
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
14
17
|
import json
|
15
18
|
import logging
|
16
19
|
import random
|
17
20
|
from dataclasses import dataclass
|
18
21
|
from pathlib import Path
|
19
|
-
from typing import List, Optional
|
22
|
+
from typing import TYPE_CHECKING, List, Optional
|
20
23
|
|
21
24
|
import torch
|
22
25
|
import torch.distributed
|
23
26
|
import torch.nn.functional as F
|
24
27
|
|
25
|
-
from sglang.srt.configs.model_config import ModelConfig
|
26
28
|
from sglang.srt.eplb import eplb_algorithms
|
27
29
|
from sglang.srt.model_loader import get_model_architecture
|
28
|
-
|
30
|
+
|
31
|
+
if TYPE_CHECKING:
|
32
|
+
from sglang.srt.configs.model_config import ModelConfig
|
33
|
+
from sglang.srt.server_args import ServerArgs
|
29
34
|
|
30
35
|
logger = logging.getLogger(__name__)
|
31
36
|
|
@@ -226,6 +231,7 @@ class ExpertLocationMetadata:
|
|
226
231
|
logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
|
227
232
|
logical_to_rank_dispatch_physical_map=(
|
228
233
|
compute_logical_to_rank_dispatch_physical_map(
|
234
|
+
server_args=server_args,
|
229
235
|
logical_to_all_physical_map=logical_to_all_physical_map,
|
230
236
|
num_gpus=ep_size,
|
231
237
|
num_physical_experts=num_physical_experts,
|
@@ -335,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
|
|
335
341
|
|
336
342
|
# TODO optimize performance (rewrite and/or run in separate process with overlap)
|
337
343
|
def compute_logical_to_rank_dispatch_physical_map(
|
344
|
+
server_args: ServerArgs,
|
338
345
|
logical_to_all_physical_map: torch.Tensor,
|
339
346
|
num_gpus: int,
|
340
347
|
num_physical_experts: int,
|
@@ -343,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
|
|
343
350
|
):
|
344
351
|
r = random.Random(seed)
|
345
352
|
|
346
|
-
|
353
|
+
num_local_gpu_physical_experts = num_physical_experts // num_gpus
|
354
|
+
num_gpus_per_node = server_args.ep_size // server_args.nnodes
|
355
|
+
num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
|
347
356
|
num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
|
348
357
|
dtype = logical_to_all_physical_map.dtype
|
349
358
|
|
@@ -367,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
|
|
367
376
|
physical_expert_id
|
368
377
|
for physical_expert_id in candidate_physical_expert_ids
|
369
378
|
if _compute_gpu_id_of_physical_expert(
|
370
|
-
physical_expert_id,
|
379
|
+
physical_expert_id, num_local_gpu_physical_experts
|
371
380
|
)
|
372
381
|
== gpu_id
|
373
382
|
]
|
374
383
|
if len(same_gpu_physical_expert_ids) > 0:
|
384
|
+
# 1. Prefer same-GPU experts
|
375
385
|
output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
|
376
|
-
|
386
|
+
else:
|
387
|
+
# 2. Otherwise, prefer same-node experts
|
388
|
+
node_id = gpu_id // num_gpus_per_node
|
389
|
+
same_node_physical_expert_ids = [
|
390
|
+
physical_expert_id
|
391
|
+
for physical_expert_id in candidate_physical_expert_ids
|
392
|
+
if _compute_node_id_of_physical_expert(
|
393
|
+
physical_expert_id, num_local_node_physical_experts
|
394
|
+
)
|
395
|
+
== node_id
|
396
|
+
]
|
397
|
+
if len(same_node_physical_expert_ids) > 0:
|
398
|
+
output_partial[gpu_id] = same_node_physical_expert_ids[0]
|
399
|
+
|
400
|
+
# 3. Fill remaining slots with fair random choices
|
377
401
|
num_remain = torch.sum(output_partial == -1).item()
|
378
402
|
output_partial[output_partial == -1] = torch.tensor(
|
379
403
|
_fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
|
@@ -399,9 +423,15 @@ def _logical_to_all_physical_raw(
|
|
399
423
|
|
400
424
|
|
401
425
|
def _compute_gpu_id_of_physical_expert(
|
402
|
-
physical_expert_id: int,
|
426
|
+
physical_expert_id: int, num_local_gpu_physical_experts: int
|
427
|
+
) -> int:
|
428
|
+
return physical_expert_id // num_local_gpu_physical_experts
|
429
|
+
|
430
|
+
|
431
|
+
def _compute_node_id_of_physical_expert(
|
432
|
+
physical_expert_id: int, num_local_host_physical_experts: int
|
403
433
|
) -> int:
|
404
|
-
return physical_expert_id //
|
434
|
+
return physical_expert_id // num_local_host_physical_experts
|
405
435
|
|
406
436
|
|
407
437
|
def _fair_choices(arr: List, k: int, r: random.Random) -> List:
|
@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
|
|
47
47
|
):
|
48
48
|
if self._first_execution:
|
49
49
|
self._first_execution = False
|
50
|
-
torch.
|
50
|
+
torch.get_device_module().empty_cache()
|
51
51
|
|
52
52
|
old_expert_location_metadata = get_global_expert_location_metadata()
|
53
53
|
assert old_expert_location_metadata is not None
|
@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):
|
|
162
162
|
|
163
163
|
try:
|
164
164
|
try:
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
self.tool_call_separator + self.bot_token
|
169
|
-
):
|
170
|
-
start_idx = len(self.tool_call_separator + self.bot_token)
|
165
|
+
tool_call_pos = current_text.find(self.bot_token)
|
166
|
+
if tool_call_pos != -1:
|
167
|
+
start_idx = tool_call_pos + len(self.bot_token)
|
171
168
|
elif self.current_tool_id > 0 and current_text.startswith(
|
172
169
|
self.tool_call_separator
|
173
170
|
):
|
@@ -50,19 +50,19 @@ class EBNFComposer:
|
|
50
50
|
|
51
51
|
CALL_RULE_MAP = {
|
52
52
|
"pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
|
53
|
-
"json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ",
|
53
|
+
"json": 'call_{name} ::= "{{" ws "\\"name\\"" ws ":" ws "\\"{name}\\"" ws "," ws "\\"arguments\\"" ws ":" ws {arguments_rule} ws "}}"',
|
54
54
|
"xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
|
55
55
|
}
|
56
56
|
|
57
57
|
ARGUMENTS_RULE_MAP = {
|
58
58
|
"pythonic": "{arg_rules}",
|
59
|
-
"json": '"{{" {arg_rules} "}}"',
|
59
|
+
"json": '"{{" ws {arg_rules} ws "}}"',
|
60
60
|
"xml": "{arg_rules}",
|
61
61
|
}
|
62
62
|
|
63
63
|
KEY_VALUE_RULE_MAP = {
|
64
64
|
"pythonic": '"{key}" "=" {valrule}',
|
65
|
-
"json": '"\\"{key}\\"" ":" {valrule}',
|
65
|
+
"json": '"\\"{key}\\"" ws ":" ws {valrule}',
|
66
66
|
"xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
|
67
67
|
}
|
68
68
|
|
@@ -165,7 +165,7 @@ class EBNFComposer:
|
|
165
165
|
tool_call_separator: Optional[str] = None,
|
166
166
|
call_rule_fmt: Optional[str] = None,
|
167
167
|
key_value_rule_fmt: Optional[str] = None,
|
168
|
-
key_value_separator: str = ",",
|
168
|
+
key_value_separator: str = 'ws "," ws',
|
169
169
|
):
|
170
170
|
"""
|
171
171
|
Generalized EBNF builder for all detectors.
|
@@ -183,6 +183,10 @@ class EBNFComposer:
|
|
183
183
|
key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
|
184
184
|
with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
|
185
185
|
based on function_format will be used.
|
186
|
+
key_value_separator: Raw EBNF fragment inserted between key-value pairs.
|
187
|
+
This string is used verbatim (not auto-quoted). Pass:
|
188
|
+
- Quoted terminals when you need a literal token (e.g. '","' or '"\\n"').
|
189
|
+
- Raw/non-terminals when you need grammar tokens (e.g. 'ws "," ws').
|
186
190
|
"""
|
187
191
|
# =================================================================
|
188
192
|
# Step 1: Determine the root tool calls rule
|
@@ -281,9 +285,7 @@ class EBNFComposer:
|
|
281
285
|
# Add required properties joined by commas
|
282
286
|
if required:
|
283
287
|
rule_parts.append(
|
284
|
-
f
|
285
|
-
prop_kv_pairs[k] for k in required
|
286
|
-
)
|
288
|
+
f" {key_value_separator} ".join(prop_kv_pairs[k] for k in required)
|
287
289
|
)
|
288
290
|
|
289
291
|
# Add optional properties with flexible ordering
|
@@ -298,14 +300,14 @@ class EBNFComposer:
|
|
298
300
|
opt_parts.append(prop_kv_pairs[optional[j]])
|
299
301
|
else:
|
300
302
|
opt_parts.append(
|
301
|
-
f
|
303
|
+
f" ( {key_value_separator} {prop_kv_pairs[optional[j]]} )?"
|
302
304
|
)
|
303
305
|
opt_alternatives.append("".join(opt_parts))
|
304
306
|
|
305
307
|
# Wrap with appropriate comma handling based on whether we have required properties
|
306
308
|
if required:
|
307
309
|
# Required properties exist, so optional group needs outer comma
|
308
|
-
rule_parts.append(f
|
310
|
+
rule_parts.append(f" ( {key_value_separator} ( ")
|
309
311
|
rule_parts.append(" | ".join(opt_alternatives))
|
310
312
|
rule_parts.append(" ) )?")
|
311
313
|
else:
|
@@ -20,6 +20,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
|
|
20
20
|
from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
|
21
21
|
from sglang.srt.function_call.qwen25_detector import Qwen25Detector
|
22
22
|
from sglang.srt.function_call.step3_detector import Step3Detector
|
23
|
+
from sglang.srt.function_call.utils import get_json_schema_constraint
|
23
24
|
|
24
25
|
logger = logging.getLogger(__name__)
|
25
26
|
|
@@ -34,17 +35,19 @@ class FunctionCallParser:
|
|
34
35
|
"""
|
35
36
|
|
36
37
|
ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
|
37
|
-
"llama3": Llama32Detector,
|
38
|
-
"qwen25": Qwen25Detector,
|
39
|
-
"mistral": MistralDetector,
|
40
38
|
"deepseekv3": DeepSeekV3Detector,
|
41
39
|
"deepseekv31": DeepSeekV31Detector,
|
42
|
-
"
|
40
|
+
"glm": Glm4MoeDetector,
|
41
|
+
"glm45": Glm4MoeDetector,
|
42
|
+
"gpt-oss": GptOssDetector,
|
43
43
|
"kimi_k2": KimiK2Detector,
|
44
|
+
"llama3": Llama32Detector,
|
45
|
+
"mistral": MistralDetector,
|
46
|
+
"pythonic": PythonicDetector,
|
47
|
+
"qwen": Qwen25Detector,
|
48
|
+
"qwen25": Qwen25Detector,
|
44
49
|
"qwen3_coder": Qwen3CoderDetector,
|
45
|
-
"glm45": Glm4MoeDetector,
|
46
50
|
"step3": Step3Detector,
|
47
|
-
"gpt-oss": GptOssDetector,
|
48
51
|
}
|
49
52
|
|
50
53
|
def __init__(self, tools: List[Tool], tool_call_parser: str):
|
@@ -69,6 +72,8 @@ class FunctionCallParser:
|
|
69
72
|
Returns:
|
70
73
|
True if the text contains a tool call, False otherwise
|
71
74
|
"""
|
75
|
+
if not self.tools:
|
76
|
+
return False
|
72
77
|
return self.detector.has_tool_call(text)
|
73
78
|
|
74
79
|
def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
|
@@ -83,6 +88,8 @@ class FunctionCallParser:
|
|
83
88
|
- The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
|
84
89
|
- A list of tool calls parsed from the text
|
85
90
|
"""
|
91
|
+
if not self.tools:
|
92
|
+
return full_text, []
|
86
93
|
parsed_result = self.detector.detect_and_parse(full_text, self.tools)
|
87
94
|
tool_call_list = parsed_result.calls
|
88
95
|
if tool_call_list:
|
@@ -102,6 +109,8 @@ class FunctionCallParser:
|
|
102
109
|
- The normal text that should be displayed to the user
|
103
110
|
- A list of tool calls parsed from the chunk
|
104
111
|
"""
|
112
|
+
if not self.tools:
|
113
|
+
return chunk_text, []
|
105
114
|
final_normal_text = ""
|
106
115
|
final_calls = []
|
107
116
|
|
@@ -172,8 +181,8 @@ class FunctionCallParser:
|
|
172
181
|
strict_tag = self.get_structure_tag()
|
173
182
|
return ("structural_tag", strict_tag)
|
174
183
|
elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
|
175
|
-
|
176
|
-
return ("
|
184
|
+
json_schema = get_json_schema_constraint(self.tools, tool_choice)
|
185
|
+
return ("json_schema", json_schema)
|
177
186
|
|
178
187
|
def get_ebnf(
|
179
188
|
self, tool_choice: Union[ToolChoice, Literal["required"]]
|
@@ -39,7 +39,7 @@ def parse_arguments(json_value):
|
|
39
39
|
|
40
40
|
class Glm4MoeDetector(BaseFormatDetector):
|
41
41
|
"""
|
42
|
-
Detector for GLM-4.5 models.
|
42
|
+
Detector for GLM-4.5 and GLM-4.6 models.
|
43
43
|
Assumes function call format:
|
44
44
|
<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
|
45
45
|
"""
|
@@ -53,7 +53,7 @@ class Glm4MoeDetector(BaseFormatDetector):
|
|
53
53
|
self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
|
54
54
|
|
55
55
|
def has_tool_call(self, text: str) -> bool:
|
56
|
-
"""Check if the text contains a glm-4.5 format tool call."""
|
56
|
+
"""Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
|
57
57
|
return self.bot_token in text
|
58
58
|
|
59
59
|
def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
|
@@ -102,7 +102,7 @@ class Glm4MoeDetector(BaseFormatDetector):
|
|
102
102
|
self, new_text: str, tools: List[Tool]
|
103
103
|
) -> StreamingParseResult:
|
104
104
|
"""
|
105
|
-
Streaming incremental parsing tool calls for GLM-4.5 format.
|
105
|
+
Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
|
106
106
|
"""
|
107
107
|
self._buffer += new_text
|
108
108
|
current_text = self._buffer
|
@@ -160,5 +160,5 @@ class Glm4MoeDetector(BaseFormatDetector):
|
|
160
160
|
function_format="xml",
|
161
161
|
call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
|
162
162
|
key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
|
163
|
-
key_value_separator="\\n",
|
163
|
+
key_value_separator='"\\n"',
|
164
164
|
)
|
@@ -81,6 +81,29 @@ class GptOssDetector(BaseFormatDetector):
|
|
81
81
|
# Always use HarmonyParser for parsing to ensure proper filtering
|
82
82
|
events = self.harmony_parser.parse(new_text)
|
83
83
|
|
84
|
+
# If there are no parsed events and the chunk contains no Harmony structural
|
85
|
+
# markers, treat it as plain text and pass it through. This fixes a bug where
|
86
|
+
# normal content was held in the buffer when tools were provided but not used.
|
87
|
+
if not events:
|
88
|
+
has_harmony_markers = any(
|
89
|
+
marker in self._buffer
|
90
|
+
for marker in (
|
91
|
+
"<|start|>",
|
92
|
+
"<|channel|>",
|
93
|
+
"<|message|>",
|
94
|
+
"<|constrain|>",
|
95
|
+
"<|end|>",
|
96
|
+
"<|call|>",
|
97
|
+
"<|return|>",
|
98
|
+
"assistantfinal",
|
99
|
+
)
|
100
|
+
)
|
101
|
+
if not has_harmony_markers:
|
102
|
+
# Plain text with no tool markers — emit as normal content
|
103
|
+
out = self._buffer
|
104
|
+
self._buffer = ""
|
105
|
+
return StreamingParseResult(normal_text=out, calls=[])
|
106
|
+
|
84
107
|
# Quick check if we might have tool calls
|
85
108
|
if (
|
86
109
|
"<|channel|>commentary to=" not in self._buffer
|
@@ -0,0 +1,63 @@
|
|
1
|
+
import json
|
2
|
+
import re
|
3
|
+
from typing import List
|
4
|
+
|
5
|
+
from sglang.srt.entrypoints.openai.protocol import Tool
|
6
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
7
|
+
from sglang.srt.function_call.core_types import StreamingParseResult
|
8
|
+
|
9
|
+
|
10
|
+
class JsonArrayParser(BaseFormatDetector):
    """
    Detector for tool calls that arrive as a bare JSON array.

    Meant for requests where a JSON schema constraint is active, i.e.
    tool_choice="required" or a specifically named tool. In that mode the
    model-specific detectors are bypassed and the output is handled as a
    JSON array directly.
    """

    def __init__(self):
        super().__init__()
        # A JSON array opens with "[", closes with "]", and separates its
        # elements (the individual tool calls) with ",".
        self.bot_token, self.eot_token = "[", "]"
        self.tool_call_separator = ","

    def has_tool_call(self, text: str) -> bool:
        """
        Report whether the text contains an opening "[" or "{", i.e. the
        start of a JSON array or of a single JSON object.
        """
        return any(opener in text for opener in ("[", "{"))

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-shot parsing is not offered by this parser.

        Raises:
            NotImplementedError: always.
        """
        message = "Detect and parse not supported for JSON schema constraints."
        raise NotImplementedError(message)

    def build_ebnf(self, tools: List[Tool]) -> str:
        """
        EBNF grammars are not produced by this parser; JSON schema
        constraints are handled by the constraint backends directly.

        Raises:
            NotImplementedError: always.
        """
        message = "EBNF generation is not supported for JSON schema constraints."
        raise NotImplementedError(message)

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Parse one streamed chunk by delegating to the base class
        implementation, passing the chunk and the tool list through as-is.
        """
        result = super().parse_streaming_increment(new_text, tools)
        return result

    def structure_info(self) -> callable:
        """
        Structure info is not produced by this parser; JSON schema
        constraints are handled by the constraint backends directly.

        Raises:
            NotImplementedError: always.
        """
        message = "structure_info not used for JSON schema constraints"
        raise NotImplementedError(message)
|
@@ -50,6 +50,11 @@ class KimiK2Detector(BaseFormatDetector):
|
|
50
50
|
|
51
51
|
self._last_arguments = ""
|
52
52
|
|
53
|
+
# Robust parser for ids like "functions.search:0" or fallback "search:0"
|
54
|
+
self.tool_call_id_regex = re.compile(
|
55
|
+
r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
|
56
|
+
)
|
57
|
+
|
53
58
|
def has_tool_call(self, text: str) -> bool:
|
54
59
|
"""Check if the text contains a KimiK2 format tool call."""
|
55
60
|
return self.bot_token in text
|
@@ -76,14 +81,18 @@ class KimiK2Detector(BaseFormatDetector):
|
|
76
81
|
tool_calls = []
|
77
82
|
for match in function_call_tuples:
|
78
83
|
function_id, function_args = match
|
79
|
-
|
80
|
-
|
84
|
+
m = self.tool_call_id_regex.match(function_id)
|
85
|
+
if not m:
|
86
|
+
logger.warning("Unexpected tool_call_id format: %s", function_id)
|
87
|
+
continue
|
88
|
+
function_name = m.group("name")
|
89
|
+
function_idx = int(m.group("index"))
|
81
90
|
|
82
91
|
logger.info(f"function_name {function_name}")
|
83
92
|
|
84
93
|
tool_calls.append(
|
85
94
|
ToolCallItem(
|
86
|
-
tool_index=function_idx,
|
95
|
+
tool_index=function_idx,
|
87
96
|
name=function_name,
|
88
97
|
parameters=function_args,
|
89
98
|
)
|
@@ -128,7 +137,11 @@ class KimiK2Detector(BaseFormatDetector):
|
|
128
137
|
function_id = match.group("tool_call_id")
|
129
138
|
function_args = match.group("function_arguments")
|
130
139
|
|
131
|
-
|
140
|
+
m = self.tool_call_id_regex.match(function_id)
|
141
|
+
if not m:
|
142
|
+
logger.warning("Unexpected tool_call_id format: %s", function_id)
|
143
|
+
return StreamingParseResult(normal_text="", calls=calls)
|
144
|
+
function_name = m.group("name")
|
132
145
|
|
133
146
|
# Initialize state if this is the first tool call
|
134
147
|
if self.current_tool_id == -1:
|
@@ -358,5 +358,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
|
|
358
358
|
function_format="xml",
|
359
359
|
call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
|
360
360
|
key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
|
361
|
-
key_value_separator="\\n",
|
361
|
+
key_value_separator='"\\n"',
|
362
362
|
)
|
@@ -1,10 +1,13 @@
|
|
1
1
|
import json
|
2
2
|
from json import JSONDecodeError, JSONDecoder
|
3
|
-
from
|
3
|
+
from json.decoder import WHITESPACE
|
4
|
+
from typing import Any, List, Literal, Optional, Tuple, Union
|
4
5
|
|
5
6
|
import partial_json_parser
|
6
7
|
from partial_json_parser.core.options import Allow
|
7
8
|
|
9
|
+
from sglang.srt.entrypoints.openai.protocol import Tool, ToolChoice
|
10
|
+
|
8
11
|
|
9
12
|
def _find_common_prefix(s1: str, s2: str) -> str:
|
10
13
|
prefix = ""
|
@@ -37,10 +40,12 @@ def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
|
|
37
40
|
"""
|
38
41
|
try:
|
39
42
|
return (partial_json_parser.loads(input_str, flags), len(input_str))
|
40
|
-
except JSONDecodeError as e:
|
41
|
-
|
42
|
-
|
43
|
-
|
43
|
+
except (JSONDecodeError, IndexError) as e:
|
44
|
+
msg = getattr(e, "msg", str(e))
|
45
|
+
if "Extra data" in msg or "pop from empty list" in msg:
|
46
|
+
start = WHITESPACE.match(input_str, 0).end()
|
47
|
+
obj, end = JSONDecoder().raw_decode(input_str, start)
|
48
|
+
return obj, end
|
44
49
|
raise
|
45
50
|
|
46
51
|
|
@@ -50,3 +55,89 @@ def _is_complete_json(input_str: str) -> bool:
|
|
50
55
|
return True
|
51
56
|
except JSONDecodeError:
|
52
57
|
return False
|
58
|
+
|
59
|
+
|
60
|
+
def _get_tool_schema_defs(tools: List[Tool]) -> dict:
    """
    Merge the "$defs" sections of every tool's parameter schema into one dict.

    Tools whose parameters are None are skipped. A definition name may be
    declared by several tools as long as every declaration is equal.

    Args:
        tools: Tools whose ``function.parameters`` schemas are scanned.

    Returns:
        Mapping of definition name to definition schema across all tools.

    Raises:
        ValueError: If two tools declare the same name with different schemas.
    """
    merged = {}
    for tool in tools:
        params = tool.function.parameters
        if params is None:
            continue
        for name, schema in params.get("$defs", {}).items():
            # Only a *different* schema under an already-seen name is a conflict;
            # an equal re-declaration is simply stored again.
            conflict = name in merged and merged[name] != schema
            if conflict:
                raise ValueError(
                    f"Tool definition '{name}' has "
                    "multiple schemas, which is not "
                    "supported."
                )
            merged[name] = schema
    return merged
|
88
|
+
|
89
|
+
|
90
|
+
def _get_tool_schema(tool: Tool) -> dict:
    """
    Build the schema fragment describing one call to ``tool``.

    The fragment requires two properties: "name", pinned to the tool's
    function name through a single-value enum, and "parameters", which is
    the tool's own parameter schema. When the tool has no (or an empty)
    parameter schema, an empty object schema is used in its place.
    """
    function = tool.function

    parameters_schema = function.parameters
    if not parameters_schema:
        parameters_schema = {"type": "object", "properties": {}}

    name_schema = {"type": "string", "enum": [function.name]}

    return {
        "properties": {
            "name": name_schema,
            "parameters": parameters_schema,
        },
        "required": ["name", "parameters"],
    }
|
102
|
+
|
103
|
+
|
104
|
+
def get_json_schema_constraint(
    tools: List[Tool], tool_choice: Union[ToolChoice, Literal["required"]]
) -> Optional[dict]:
    """
    Get the JSON schema constraint for the specified tool choice.

    The schema always describes a JSON array of tool-call objects:

    * For a specific ``ToolChoice``, the array holds exactly one call to the
      named tool.
    * For ``"required"``, the array holds at least one call, each matching
      any one of ``tools``.

    In both cases the "$defs" declared inside the selected tools' parameter
    schemas are copied to the top level of the returned schema. The parameter
    schemas end up nested under ``items``, so a reference such as
    ``"#/$defs/Foo"`` inside them is resolved against the schema root and
    would otherwise point at nothing.

    Args:
        tools: The tools available for the request.
        tool_choice: The tool choice specification.

    Returns:
        JSON schema dict, or None if the named tool is not in ``tools`` or
        ``tool_choice`` is neither a ``ToolChoice`` nor ``"required"``.

    Raises:
        ValueError: If tools declare conflicting "$defs" entries
            (propagated from ``_get_tool_schema_defs``).
    """
    if isinstance(tool_choice, ToolChoice):
        # Specific function choice: constrain to exactly one call of that tool.
        fn_name = tool_choice.function.name
        selected = next(
            (tool for tool in tools if tool.function.name == fn_name), None
        )
        if selected is None:
            return None
        json_schema = {
            "type": "array",
            "minItems": 1,
            "maxItems": 1,
            "items": _get_tool_schema(selected),
        }
        defs_sources = [selected]
    elif tool_choice == "required":
        json_schema = {
            "type": "array",
            "minItems": 1,
            "items": {
                "type": "object",
                "anyOf": [_get_tool_schema(tool) for tool in tools],
            },
        }
        defs_sources = tools
    else:
        return None

    # Hoist "$defs" to the root so "#/$defs/..." references inside the nested
    # parameter schemas stay resolvable (previously done only for "required").
    json_schema_defs = _get_tool_schema_defs(defs_sources)
    if json_schema_defs:
        json_schema["$defs"] = json_schema_defs
    return json_schema
|
@@ -0,0 +1 @@
|
|
1
|
+
# SGLang gRPC module
|