sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -90,8 +90,8 @@ class OpenAIServingCompletion(OpenAIServingBase):
|
|
90
90
|
else:
|
91
91
|
prompt_kwargs = {"input_ids": prompt}
|
92
92
|
|
93
|
-
# Extract
|
94
|
-
|
93
|
+
# Extract custom labels from raw request headers
|
94
|
+
custom_labels = self.extract_custom_labels(raw_request)
|
95
95
|
|
96
96
|
adapted_request = GenerateReqInput(
|
97
97
|
**prompt_kwargs,
|
@@ -107,7 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
|
|
107
107
|
bootstrap_room=request.bootstrap_room,
|
108
108
|
return_hidden_states=request.return_hidden_states,
|
109
109
|
rid=request.rid,
|
110
|
-
|
110
|
+
extra_key=self._compute_extra_key(request),
|
111
|
+
priority=request.priority,
|
112
|
+
custom_labels=custom_labels,
|
111
113
|
)
|
112
114
|
|
113
115
|
return adapted_request, request
|
@@ -123,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
123
123
|
|
124
124
|
self.background_tasks: dict[str, asyncio.Task] = {}
|
125
125
|
|
126
|
+
# error helpers dedicated for v1/responses
|
127
|
+
def create_error_response(
    self,
    message: str,
    err_type: str = "invalid_request_error",
    status_code: int = 400,
    param: Optional[str] = None,
) -> ORJSONResponse:
    """Build the JSON error reply used by the v1/responses handlers.

    The error details are nested under a top-level ``"error"`` key, and
    ``status_code`` is used both as the HTTP status of the response and as
    the ``"code"`` field inside the payload.

    Args:
        message: Human-readable description of the failure.
        err_type: Value stored in the ``"type"`` field of the error object.
        status_code: HTTP status code; also echoed as ``"code"``.
        param: Optional name of the offending request parameter.

    Returns:
        An ``ORJSONResponse`` whose body is ``{"error": {...}}``.
    """
    payload = {
        "error": dict(
            message=message,
            type=err_type,
            param=param,
            code=status_code,
        )
    }
    return ORJSONResponse(content=payload, status_code=status_code)
|
141
|
+
|
142
|
+
def create_streaming_error_response(
    self,
    message: str,
    err_type: str = "BadRequestError",
    status_code: int = 400,
) -> str:
    """Serialize an error as a JSON string.

    The returned text has the shape ``{"error": {...}}``; the ``"param"``
    field is always ``null``.

    Args:
        message: Human-readable description of the failure.
        err_type: Value stored in the ``"type"`` field of the error object.
        status_code: Value stored in the ``"code"`` field of the error object.

    Returns:
        The JSON-encoded error document.
    """
    error_body = {
        "message": message,
        "type": err_type,
        "param": None,
        "code": status_code,
    }
    return json.dumps({"error": error_body})
|
158
|
+
|
126
159
|
def _request_id_prefix(self) -> str:
|
127
160
|
return "resp_"
|
128
161
|
|
@@ -245,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
245
278
|
sampling_params=sampling_params,
|
246
279
|
stream=request.stream,
|
247
280
|
rid=request.request_id,
|
281
|
+
extra_key=self._compute_extra_key(request),
|
248
282
|
background=request.background,
|
249
283
|
)
|
250
284
|
|
@@ -833,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
833
867
|
|
834
868
|
async for ctx in result_generator:
|
835
869
|
|
870
|
+
# Only process context objects that implement the `is_expecting_start()` method,
|
871
|
+
# which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
|
872
|
+
# Contexts without this method are skipped, as they do not represent a new turn
|
873
|
+
# or are not compatible with per-turn handling in the /v1/responses endpoint.
|
874
|
+
if not hasattr(ctx, "is_expecting_start"):
|
875
|
+
continue
|
876
|
+
|
836
877
|
if ctx.is_expecting_start():
|
837
878
|
current_output_index += 1
|
838
879
|
sent_output_item_added = False
|
@@ -1250,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
1250
1291
|
sampling_params=sampling_params,
|
1251
1292
|
stream=adapted_request.stream,
|
1252
1293
|
rid=request_id,
|
1294
|
+
extra_key=adapted_request.extra_key,
|
1253
1295
|
return_logprob=adapted_request.return_logprob,
|
1254
1296
|
logprob_start_len=adapted_request.logprob_start_len,
|
1255
1297
|
top_logprobs_num=adapted_request.top_logprobs_num,
|
sglang/srt/environ.py
ADDED
@@ -0,0 +1,285 @@
|
|
1
|
+
import os
|
2
|
+
import subprocess
|
3
|
+
import warnings
|
4
|
+
from contextlib import ExitStack, contextmanager
|
5
|
+
from typing import Any
|
6
|
+
|
7
|
+
|
8
|
+
class EnvField:
    """Typed accessor for one environment variable.

    Instances are meant to be declared as class attributes; the attribute
    name becomes the environment variable name (see ``__set_name__``).
    Subclasses implement ``parse`` to turn the raw string into a typed value.

    ``None`` is used internally to mean "not set". A value that was
    explicitly set to ``None`` is therefore tracked with ``_set_to_none`` so
    that it is not confused with an unset variable. Use ``clear()`` to go
    back to the default fallback.
    """

    def __init__(self, default: Any):
        # Value returned by ``get()`` when the variable is unset or invalid.
        self.default = default
        # NOTE: we use None to indicate whether the value is set or not.
        # If the value is manually set to None, we need to mark it here.
        # Always use clear() to reset the value, which leads to the default
        # fallback.
        self._set_to_none = False

    def __set_name__(self, owner, name):
        # The attribute name doubles as the environment variable name.
        self.name = name

    def parse(self, value: str) -> Any:
        """Convert the raw environment string; subclasses must override.

        Implementations signal an unusable value by raising ``ValueError``.
        """
        raise NotImplementedError()

    def get(self) -> Any:
        """Return the current value.

        Returns ``None`` if the field was explicitly set to ``None``, the
        default if the variable is absent, and otherwise the parsed value.
        A ``ValueError`` from ``parse`` produces a warning and the default.
        """
        value = os.getenv(self.name)
        if self._set_to_none:
            # set(None) removes the variable, so it must still be absent.
            assert value is None
            return None

        if value is None:
            return self.default

        try:
            return self.parse(value)
        except ValueError as e:
            warnings.warn(
                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
            )
            return self.default

    def is_set(self):
        """Return True if the variable exists or was explicitly set to None."""
        # NOTE: If None is manually set, it is considered as set.
        return self.name in os.environ or self._set_to_none

    def get_set_value_or(self, or_value: Any):
        """Return ``get()`` if the field is set, otherwise ``or_value``."""
        # NOTE: Ugly usage, but only way to get custom default value.
        return self.get() if self.is_set() else or_value

    def set(self, value: Any):
        """Set the field; ``None`` removes the variable and marks it as None."""
        if value is None:
            self._set_to_none = True
            os.environ.pop(self.name, None)
        else:
            self._set_to_none = False
            os.environ[self.name] = str(value)

    @contextmanager
    def override(self, value: Any):
        """Temporarily set the field to ``value`` for the ``with`` body.

        The previous state (presence in ``os.environ``, raw string, and the
        explicit-None flag) is restored on exit, including when the body
        raises.
        """
        backup_present = self.name in os.environ
        backup_value = os.environ.get(self.name)
        backup_set_to_none = self._set_to_none
        self.set(value)
        try:
            yield
        finally:
            # Restore in ``finally`` so an exception in the body cannot leak
            # the temporary value into the rest of the process.
            if backup_present:
                os.environ[self.name] = backup_value
            else:
                os.environ.pop(self.name, None)
            self._set_to_none = backup_set_to_none

    def clear(self):
        """Remove the variable and the explicit-None mark (default applies)."""
        os.environ.pop(self.name, None)
        self._set_to_none = False

    @property
    def value(self):
        """Shorthand for ``get()``."""
        return self.get()
|
75
|
+
|
76
|
+
|
77
|
+
class EnvStr(EnvField):
    """Environment field whose value is used as a plain string."""

    def parse(self, value: str) -> str:
        # The raw environment text is already the desired representation.
        return value
|
80
|
+
|
81
|
+
|
82
|
+
class EnvBool(EnvField):
    """Environment field parsed as a boolean (case-insensitive)."""

    def parse(self, value: str) -> bool:
        """Map true/1/yes/y to True and false/0/no/n to False.

        Raises:
            ValueError: If the lower-cased text is none of the accepted
                spellings.
        """
        normalized = value.lower()
        truthy = ("true", "1", "yes", "y")
        falsy = ("false", "0", "no", "n")
        if normalized not in truthy + falsy:
            raise ValueError(f'"{normalized}" is not a valid boolean value')
        return normalized in truthy
|
90
|
+
|
91
|
+
|
92
|
+
class EnvInt(EnvField):
    """Environment field parsed with ``int()``."""

    def parse(self, value: str) -> int:
        """Return ``int(value)``.

        Raises:
            ValueError: With a message naming the offending text when the
                conversion fails.
        """
        try:
            parsed = int(value)
        except ValueError:
            raise ValueError(f'"{value}" is not a valid integer value')
        return parsed
|
98
|
+
|
99
|
+
|
100
|
+
class EnvFloat(EnvField):
    """Environment field parsed with ``float()``."""

    def parse(self, value: str) -> float:
        """Return ``float(value)``.

        Raises:
            ValueError: With a message naming the offending text when the
                conversion fails.
        """
        try:
            parsed = float(value)
        except ValueError:
            raise ValueError(f'"{value}" is not a valid float value')
        return parsed
|
106
|
+
|
107
|
+
|
108
|
+
class Envs:
    """Registry of environment variables, one typed field per attribute.

    Each attribute name is the environment variable name; the constructor
    argument of the field is the default used when the variable is unset.
    """

    # fmt: off

    # Model & File Download
    SGLANG_USE_MODELSCOPE = EnvBool(False)

    # Test & Debug
    SGLANG_IS_IN_CI = EnvBool(False)
    SGLANG_AMD_CI = EnvBool(False)
    SGLANG_TEST_RETRACT = EnvBool(False)
    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
    SGLANG_RECORD_STEP_TIME = EnvBool(False)
    SGLANG_GC_LOG = EnvBool(False)
    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")

    # Model Parallel
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)

    # Constrained Decoding
    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)

    # Hi-Cache
    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)

    # Mooncake KV Transfer
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)

    # AMD & ROCm
    SGLANG_USE_AITER = EnvBool(False)
    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)

    # Quantization
    SGLANG_INT4_WEIGHT = EnvBool(False)
    SGLANG_CPU_QUANTIZATION = EnvBool(False)
    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)

    # Flashinfer
    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)

    # Triton
    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)

    # Torch Compile
    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)

    # EPLB
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)

    # TBO
    SGLANG_TBO_DEBUG = EnvBool(False)

    # DeepGemm
    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
    SGLANG_DG_USE_NVRTC = EnvBool(False)
    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)

    # sgl-kernel
    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)

    # vLLM dependencies
    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)

    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
    SGLANG_MOE_PADDING = EnvBool(False)
    SGLANG_CUTLASS_MOE = EnvBool(False)
    HF_HUB_DISABLE_XET = EnvBool(False)
    DISABLE_OPENAPI_DOC = EnvBool(False)
    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)

    # Deterministic inference
    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

    # fmt: on


# Module-level instance through which the fields are accessed.
envs = Envs()
|
214
|
+
|
215
|
+
|
216
|
+
def _convert_SGL_to_SGLANG():
    """Mirror deprecated ``SGL_*`` variables under the ``SGLANG_*`` prefix.

    For every environment variable whose name starts with ``SGL_``, a
    deprecation warning is emitted and the value is copied to the same name
    with the prefix replaced by ``SGLANG_``. The old variable is left in
    place, and an existing ``SGLANG_*`` value is overwritten.
    """
    # Snapshot the items first: the loop body inserts new keys, and mutating
    # a mapping while iterating a live view of it is not something to rely on.
    for key, value in list(os.environ.items()):
        if key.startswith("SGL_"):
            new_key = key.replace("SGL_", "SGLANG_", 1)
            warnings.warn(
                f"Environment variable {key} is deprecated, please use {new_key}"
            )
            os.environ[new_key] = value


_convert_SGL_to_SGLANG()
|
227
|
+
|
228
|
+
|
229
|
+
def example_with_exit_stack():
    """Show how to drive ``override`` through an ``ExitStack``.

    The example pins its own starting state (explicit ``None``) instead of
    relying on whatever the caller left behind, so the final assertion holds
    no matter where the function is called from. The caller's state is
    restored when the outer override exits.
    """
    field = envs.SGLANG_TEST_RETRACT

    # Known baseline: explicitly set to None for the duration of the example.
    with field.override(None):
        # Use this style of context manager in unit test
        exit_stack = ExitStack()
        exit_stack.enter_context(field.override(False))
        assert field.value is False
        exit_stack.close()
        # Closing the stack brings back the explicit-None baseline.
        assert field.value is None
|
236
|
+
|
237
|
+
|
238
|
+
def example_with_subprocess():
    """Show that an override is visible to child processes and then undone.

    A child interpreter prints its view of ``SGLANG_TEST_RETRACT``: inside
    the override it must see ``"True"``; afterwards it must see exactly what
    the parent had before the override (``"None"`` when unset).
    """
    import sys

    field = envs.SGLANG_TEST_RETRACT
    # Use the running interpreter; a bare "python" may not exist on PATH.
    command = [
        sys.executable,
        "-c",
        "import os; print(os.getenv('SGLANG_TEST_RETRACT'))",
    ]

    def _child_view() -> str:
        # run() drains both pipes and reaps the child, avoiding the
        # wait()-before-read deadlock and leaked processes.
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        return completed.stdout.decode("utf-8").strip()

    # What a child sees without any override ("None" if the variable is unset).
    baseline = str(os.getenv(field.name))

    with field.override(True):
        assert _child_view() == "True"

    assert _child_view() == baseline
|
251
|
+
|
252
|
+
|
253
|
+
def examples():
    """Walk through the field API using ``SGLANG_TEST_RETRACT``.

    Covers clear/default fallback, explicit ``None``, a regular set, nested
    overrides in both directions, and finally the ExitStack and subprocess
    examples.
    """
    # Example usage for envs
    flag = envs.SGLANG_TEST_RETRACT

    # Cleared field falls back to its default.
    flag.clear()
    assert flag.value is False

    # Explicit None counts as "set".
    flag.set(None)
    assert flag.is_set() and flag.value is None

    flag.clear()
    assert not flag.is_set()

    flag.set(True)
    assert flag.value is True

    # Override a real value with None, then get the real value back.
    with flag.override(None):
        assert flag.is_set() and flag.value is None

    assert flag.value is True

    # Override an explicit None with a real value, then get None back.
    flag.set(None)
    with flag.override(True):
        assert flag.value is True

    assert flag.is_set() and flag.value is None

    example_with_exit_stack()
    example_with_subprocess()


if __name__ == "__main__":
    examples()
|
@@ -231,6 +231,7 @@ class ExpertLocationMetadata:
|
|
231
231
|
logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
|
232
232
|
logical_to_rank_dispatch_physical_map=(
|
233
233
|
compute_logical_to_rank_dispatch_physical_map(
|
234
|
+
server_args=server_args,
|
234
235
|
logical_to_all_physical_map=logical_to_all_physical_map,
|
235
236
|
num_gpus=ep_size,
|
236
237
|
num_physical_experts=num_physical_experts,
|
@@ -340,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
|
|
340
341
|
|
341
342
|
# TODO optimize performance (rewrite and/or run in separate process with overlap)
|
342
343
|
def compute_logical_to_rank_dispatch_physical_map(
|
344
|
+
server_args: ServerArgs,
|
343
345
|
logical_to_all_physical_map: torch.Tensor,
|
344
346
|
num_gpus: int,
|
345
347
|
num_physical_experts: int,
|
@@ -348,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
|
|
348
350
|
):
|
349
351
|
r = random.Random(seed)
|
350
352
|
|
351
|
-
|
353
|
+
num_local_gpu_physical_experts = num_physical_experts // num_gpus
|
354
|
+
num_gpus_per_node = server_args.ep_size // server_args.nnodes
|
355
|
+
num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
|
352
356
|
num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
|
353
357
|
dtype = logical_to_all_physical_map.dtype
|
354
358
|
|
@@ -372,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
|
|
372
376
|
physical_expert_id
|
373
377
|
for physical_expert_id in candidate_physical_expert_ids
|
374
378
|
if _compute_gpu_id_of_physical_expert(
|
375
|
-
physical_expert_id,
|
379
|
+
physical_expert_id, num_local_gpu_physical_experts
|
376
380
|
)
|
377
381
|
== gpu_id
|
378
382
|
]
|
379
383
|
if len(same_gpu_physical_expert_ids) > 0:
|
384
|
+
# 1. Prefer same-GPU experts
|
380
385
|
output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
|
381
|
-
|
386
|
+
else:
|
387
|
+
# 2. Otherwise, prefer same-node experts
|
388
|
+
node_id = gpu_id // num_gpus_per_node
|
389
|
+
same_node_physical_expert_ids = [
|
390
|
+
physical_expert_id
|
391
|
+
for physical_expert_id in candidate_physical_expert_ids
|
392
|
+
if _compute_node_id_of_physical_expert(
|
393
|
+
physical_expert_id, num_local_node_physical_experts
|
394
|
+
)
|
395
|
+
== node_id
|
396
|
+
]
|
397
|
+
if len(same_node_physical_expert_ids) > 0:
|
398
|
+
output_partial[gpu_id] = same_node_physical_expert_ids[0]
|
399
|
+
|
400
|
+
# 3. Fill remaining slots with fair random choices
|
382
401
|
num_remain = torch.sum(output_partial == -1).item()
|
383
402
|
output_partial[output_partial == -1] = torch.tensor(
|
384
403
|
_fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
|
@@ -404,9 +423,15 @@ def _logical_to_all_physical_raw(
|
|
404
423
|
|
405
424
|
|
406
425
|
def _compute_gpu_id_of_physical_expert(
|
407
|
-
physical_expert_id: int,
|
426
|
+
physical_expert_id: int, num_local_gpu_physical_experts: int
|
427
|
+
) -> int:
|
428
|
+
return physical_expert_id // num_local_gpu_physical_experts
|
429
|
+
|
430
|
+
|
431
|
+
def _compute_node_id_of_physical_expert(
|
432
|
+
physical_expert_id: int, num_local_host_physical_experts: int
|
408
433
|
) -> int:
|
409
|
-
return physical_expert_id //
|
434
|
+
return physical_expert_id // num_local_host_physical_experts
|
410
435
|
|
411
436
|
|
412
437
|
def _fair_choices(arr: List, k: int, r: random.Random) -> List:
|
@@ -20,6 +20,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
|
|
20
20
|
from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
|
21
21
|
from sglang.srt.function_call.qwen25_detector import Qwen25Detector
|
22
22
|
from sglang.srt.function_call.step3_detector import Step3Detector
|
23
|
+
from sglang.srt.function_call.utils import get_json_schema_constraint
|
23
24
|
|
24
25
|
logger = logging.getLogger(__name__)
|
25
26
|
|
@@ -178,8 +179,8 @@ class FunctionCallParser:
|
|
178
179
|
strict_tag = self.get_structure_tag()
|
179
180
|
return ("structural_tag", strict_tag)
|
180
181
|
elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
|
181
|
-
|
182
|
-
return ("
|
182
|
+
json_schema = get_json_schema_constraint(self.tools, tool_choice)
|
183
|
+
return ("json_schema", json_schema)
|
183
184
|
|
184
185
|
def get_ebnf(
|
185
186
|
self, tool_choice: Union[ToolChoice, Literal["required"]]
|
@@ -39,7 +39,7 @@ def parse_arguments(json_value):
|
|
39
39
|
|
40
40
|
class Glm4MoeDetector(BaseFormatDetector):
|
41
41
|
"""
|
42
|
-
Detector for GLM-4.5 models.
|
42
|
+
Detector for GLM-4.5 and GLM-4.6 models.
|
43
43
|
Assumes function call format:
|
44
44
|
<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
|
45
45
|
"""
|
@@ -53,7 +53,7 @@ class Glm4MoeDetector(BaseFormatDetector):
|
|
53
53
|
self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
|
54
54
|
|
55
55
|
def has_tool_call(self, text: str) -> bool:
    """Return True when the glm-4.5 / glm-4.6 tool-call start token occurs in `text`."""
    contains_start_token = self.bot_token in text
    return contains_start_token
|
58
58
|
|
59
59
|
def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
|
@@ -102,7 +102,7 @@ class Glm4MoeDetector(BaseFormatDetector):
|
|
102
102
|
self, new_text: str, tools: List[Tool]
|
103
103
|
) -> StreamingParseResult:
|
104
104
|
"""
|
105
|
-
Streaming incremental parsing tool calls for GLM-4.5 format.
|
105
|
+
Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
|
106
106
|
"""
|
107
107
|
self._buffer += new_text
|
108
108
|
current_text = self._buffer
|
@@ -81,6 +81,29 @@ class GptOssDetector(BaseFormatDetector):
|
|
81
81
|
# Always use HarmonyParser for parsing to ensure proper filtering
|
82
82
|
events = self.harmony_parser.parse(new_text)
|
83
83
|
|
84
|
+
# If there are no parsed events and the chunk contains no Harmony structural
|
85
|
+
# markers, treat it as plain text and pass it through. This fixes a bug where
|
86
|
+
# normal content was held in the buffer when tools were provided but not used.
|
87
|
+
if not events:
|
88
|
+
has_harmony_markers = any(
|
89
|
+
marker in self._buffer
|
90
|
+
for marker in (
|
91
|
+
"<|start|>",
|
92
|
+
"<|channel|>",
|
93
|
+
"<|message|>",
|
94
|
+
"<|constrain|>",
|
95
|
+
"<|end|>",
|
96
|
+
"<|call|>",
|
97
|
+
"<|return|>",
|
98
|
+
"assistantfinal",
|
99
|
+
)
|
100
|
+
)
|
101
|
+
if not has_harmony_markers:
|
102
|
+
# Plain text with no tool markers — emit as normal content
|
103
|
+
out = self._buffer
|
104
|
+
self._buffer = ""
|
105
|
+
return StreamingParseResult(normal_text=out, calls=[])
|
106
|
+
|
84
107
|
# Quick check if we might have tool calls
|
85
108
|
if (
|
86
109
|
"<|channel|>commentary to=" not in self._buffer
|
@@ -0,0 +1,63 @@
|
|
1
|
+
import json
|
2
|
+
import re
|
3
|
+
from typing import List
|
4
|
+
|
5
|
+
from sglang.srt.entrypoints.openai.protocol import Tool
|
6
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
7
|
+
from sglang.srt.function_call.core_types import StreamingParseResult
|
8
|
+
|
9
|
+
|
10
|
+
class JsonArrayParser(BaseFormatDetector):
    """
    Tool-call parser for output generated under a JSON schema constraint.

    It is selected when tool_choice="required" or when a specific tool is
    named; in that mode the model-specific parsers are bypassed and the output
    is parsed directly as a JSON array.
    """

    def __init__(self):
        super().__init__()
        # Array delimiters and the element separator for JSON array parsing.
        self.bot_token, self.eot_token = "[", "]"
        self.tool_call_separator = ","

    def has_tool_call(self, text: str) -> bool:
        """
        Report whether `text` contains "[" or "{", i.e. the opening of a JSON
        array or of a single JSON object.
        """
        return any(opener in text for opener in ("[", "{"))

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-shot parsing is not supported by this parser.

        Always raises NotImplementedError.
        """
        raise NotImplementedError(
            "Detect and parse not supported for JSON schema constraints."
        )

    def build_ebnf(self, tools: List[Tool]) -> str:
        """
        EBNF grammar construction is not provided here: JSON schema
        constraints are handled by the constraint backends directly.

        Always raises NotImplementedError.
        """
        raise NotImplementedError(
            "EBNF generation is not supported for JSON schema constraints."
        )

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Delegate incremental parsing of `new_text` to the base-class
        implementation and return its result unchanged.
        """
        result = super().parse_streaming_increment(new_text, tools)
        return result

    def structure_info(self) -> callable:
        """
        StructureInfo factories are not provided here: JSON schema constraints
        are handled by the constraint backends directly.

        Always raises NotImplementedError.
        """
        raise NotImplementedError("structure_info not used for JSON schema constraints")
|
@@ -50,6 +50,11 @@ class KimiK2Detector(BaseFormatDetector):
|
|
50
50
|
|
51
51
|
self._last_arguments = ""
|
52
52
|
|
53
|
+
# Robust parser for ids like "functions.search:0" or fallback "search:0"
|
54
|
+
self.tool_call_id_regex = re.compile(
|
55
|
+
r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
|
56
|
+
)
|
57
|
+
|
53
58
|
def has_tool_call(self, text: str) -> bool:
    """Return True when the KimiK2 tool-call start token occurs in `text`."""
    contains_start_token = self.bot_token in text
    return contains_start_token
|
@@ -76,14 +81,18 @@ class KimiK2Detector(BaseFormatDetector):
|
|
76
81
|
tool_calls = []
|
77
82
|
for match in function_call_tuples:
|
78
83
|
function_id, function_args = match
|
79
|
-
|
80
|
-
|
84
|
+
m = self.tool_call_id_regex.match(function_id)
|
85
|
+
if not m:
|
86
|
+
logger.warning("Unexpected tool_call_id format: %s", function_id)
|
87
|
+
continue
|
88
|
+
function_name = m.group("name")
|
89
|
+
function_idx = int(m.group("index"))
|
81
90
|
|
82
91
|
logger.info(f"function_name {function_name}")
|
83
92
|
|
84
93
|
tool_calls.append(
|
85
94
|
ToolCallItem(
|
86
|
-
tool_index=function_idx,
|
95
|
+
tool_index=function_idx,
|
87
96
|
name=function_name,
|
88
97
|
parameters=function_args,
|
89
98
|
)
|
@@ -128,7 +137,11 @@ class KimiK2Detector(BaseFormatDetector):
|
|
128
137
|
function_id = match.group("tool_call_id")
|
129
138
|
function_args = match.group("function_arguments")
|
130
139
|
|
131
|
-
|
140
|
+
m = self.tool_call_id_regex.match(function_id)
|
141
|
+
if not m:
|
142
|
+
logger.warning("Unexpected tool_call_id format: %s", function_id)
|
143
|
+
return StreamingParseResult(normal_text="", calls=calls)
|
144
|
+
function_name = m.group("name")
|
132
145
|
|
133
146
|
# Initialize state if this is the first tool call
|
134
147
|
if self.current_tool_id == -1:
|