sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

@@ -16,12 +16,14 @@
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, TypeAlias, Union
+from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
 
 from openai.types.responses import (
     ResponseFunctionToolCall,
     ResponseInputItemParam,
     ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
     ResponseReasoningItem,
 )
 from openai.types.responses.response import ToolChoice
@@ -228,9 +230,15 @@ class CompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
-    # For
-
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
 
     @field_validator("max_tokens")
     @classmethod
@@ -337,7 +345,7 @@ class FunctionResponse(BaseModel):
     """Function response."""
 
     name: Optional[str] = None
-    arguments: Optional[str] = None
+    arguments: Optional[str | Dict[str, Any]] = None
 
 
 class ToolCall(BaseModel):
@@ -386,7 +394,7 @@ class Function(BaseModel):
     """Function descriptions."""
 
     description: Optional[str] = Field(default=None, examples=[None])
-    name:
+    name: str
     parameters: Optional[object] = None
     strict: bool = False
 
@@ -543,6 +551,12 @@ class ChatCompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
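Both `CompletionRequest` and `ChatCompletionRequest` (and, in scalar form, `ResponsesRequest` below) gain `extra_key`, `cache_salt`, and `priority`. A minimal sketch of exercising them through the OpenAI Python client, assuming an SGLang server at `localhost:30000`; the field values are illustrative only:

```python
# Sketch: pass the new per-request fields via extra_body (the OpenAI client
# forwards unknown fields verbatim). Assumes an SGLang server at :30000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello"}],
    extra_body={
        "cache_salt": "tenant-a",  # salts the request's cache key
        "extra_key": "batch-42",   # classifies the request
        "priority": 1,             # scheduling priority
    },
)
print(resp.choices[0].message.content)
```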
@@ -644,6 +658,8 @@ class EmbeddingRequest(BaseModel):
 
     # The request id.
     rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
 
 class EmbeddingObject(BaseModel):
@@ -772,6 +788,13 @@ class ResponsesRequest(BaseModel):
         description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
     )
     priority: int = Field(default=0, description="Request priority")
+    extra_key: Optional[str] = Field(
+        default=None,
+        description="Extra key for classifying the request (e.g. cache_salt)",
+    )
+    cache_salt: Optional[str] = Field(
+        default=None, description="Cache salt for request caching"
+    )
 
     # SGLang-specific sampling parameters
     frequency_penalty: float = 0.0
@@ -860,6 +883,26 @@ class ResponsesResponse(BaseModel):
     tool_choice: str = "auto"
     tools: List[ResponseTool] = Field(default_factory=list)
 
+    # OpenAI compatibility fields. not all are used at the moment.
+    # Recommend checking https://platform.openai.com/docs/api-reference/responses
+    error: Optional[dict] = None
+    incomplete_details: Optional[dict] = None  # TODO(v) support this input
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[dict] = (
+        # Unused. No model supports this. For GPT-oss, system prompt sets
+        # the field, not server args.
+        None  # {"effort": Optional[str], "summary": Optional[str]}
+    )
+    store: Optional[bool] = None
+    temperature: Optional[float] = None
+    text: Optional[dict] = None  # e.g. {"format": {"type": "text"}}
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
     @classmethod
     def from_request(
         cls,
@@ -874,6 +917,41 @@ class ResponsesResponse(BaseModel):
         usage: Optional[UsageInfo],
     ) -> "ResponsesResponse":
         """Create a response from a request."""
+
+        # Determine if the output is plain text only to set text.format
+        def _is_text_only(
+            items: List[
+                Union[
+                    ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
+                ]
+            ]
+        ) -> bool:
+            if not items:
+                return False
+            for it in items:
+                # tool call -> not pure text.
+                if isinstance(it, ResponseReasoningItem) or isinstance(
+                    it, ResponseFunctionToolCall
+                ):
+                    return False
+                try:
+                    if isinstance(it, ResponseOutputText):
+                        continue
+                    elif isinstance(it, ResponseOutputMessage):
+                        if not it.content:
+                            continue
+                        for c in it.content:
+                            if not isinstance(c, ResponseOutputText):
+                                return False
+                    else:
+                        # Unknown type, not considered text-only
+                        return False
+                except AttributeError:
+                    return False
+            return True
+
+        text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
+
         return cls(
             id=request.request_id,
             created_at=created_time,
@@ -884,6 +962,23 @@ class ResponsesResponse(BaseModel):
             parallel_tool_calls=request.parallel_tool_calls or True,
             tool_choice=request.tool_choice,
             tools=request.tools,
+            # fields for parity with v1/responses
+            error=None,
+            incomplete_details=None,
+            instructions=request.instructions,
+            max_output_tokens=request.max_output_tokens,
+            previous_response_id=request.previous_response_id,  # TODO(v): ensure this is propagated if retrieved from store
+            reasoning={
+                "effort": request.reasoning.effort if request.reasoning else None,
+                "summary": None,  # unused
+            },
+            store=request.store,
+            temperature=request.temperature,
+            text=text_format,  # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
+            top_p=request.top_p,
+            truncation=request.truncation,
+            user=request.user,
+            metadata=request.metadata or {},
         )
 
 
@@ -922,6 +1017,16 @@ class MessageProcessingResult:
     tool_call_constraint: Optional[Any] = None
 
 
+class ToolCallProcessingResult(NamedTuple):
+    """Result of processing tool calls in a response."""
+
+    tool_calls: Optional[
+        List[Any]
+    ]  # List of ToolCall objects or None if parsing failed
+    remaining_text: str  # Text remaining after parsing tool calls
+    finish_reason: Dict[str, Any]  # Updated finish reason dictionary
+
+
 class ResponseReasoningTextContent(BaseModel):
     text: str
     type: Literal["reasoning_text"] = "reasoning_text"
sglang/srt/entrypoints/openai/serving_base.py

@@ -27,10 +27,10 @@ class OpenAIServingBase(ABC):
         self.tokenizer_manager = tokenizer_manager
         self.allowed_custom_labels = (
             set(
-                self.tokenizer_manager.server_args.
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
             )
             if isinstance(self.tokenizer_manager.server_args, ServerArgs)
-            and self.tokenizer_manager.server_args.
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
             else None
         )
 
@@ -62,6 +62,12 @@
             return self.create_error_response(
                 message=e.detail, err_type=str(e.status_code), status_code=e.status_code
             )
+        except ValueError as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
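A `ValueError` raised while handling a request now maps to HTTP 400 instead of falling through to the generic 500 handler. A minimal standalone sketch of the mapping; `create_error_response` and the error-body field names here are stand-ins, not SGLang's exact implementation:

```python
# Minimal sketch of the added error mapping: ValueError -> HTTP 400.
def create_error_response(message: str, err_type: str, status_code: int) -> dict:
    return {"error": {"message": message, "type": err_type, "code": status_code}}

def handle(request_fn):
    try:
        return request_fn()
    except ValueError as e:  # bad user input now yields 400, not 500
        return create_error_response(str(e), "BadRequest", 400)
    except Exception as e:   # everything else stays an internal error
        return create_error_response(str(e), "InternalServerError", 500)

def bad_request():
    raise ValueError("tool_choice invalid")

print(handle(bad_request))
# {'error': {'message': 'tool_choice invalid', 'type': 'BadRequest', 'code': 400}}
```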
@@ -86,6 +92,19 @@
 
         return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
 
+    def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
+        parts = []
+        for key in ["cache_salt", "extra_key"]:
+            value = getattr(request, key, None)
+            if value:
+                if not isinstance(value, str):
+                    raise TypeError(
+                        f"Value of {key} must be a string, but got {type(value).__name__}"
+                    )
+                parts.append(value)
+        return "".join(parts) if parts else None
+
     @abstractmethod
     def _convert_to_internal_request(
         self,
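The helper concatenates `cache_salt` and `extra_key`, in that order, when both are present. A standalone illustration of the same logic (`compute_extra_key` here is a copy for demonstration, not the method itself):

```python
# Standalone illustration of _compute_extra_key's concatenation order:
# cache_salt comes first, then extra_key; either alone is passed through.
from types import SimpleNamespace

def compute_extra_key(request) -> str | None:
    parts = []
    for key in ["cache_salt", "extra_key"]:
        value = getattr(request, key, None)
        if value:
            if not isinstance(value, str):
                raise TypeError(f"Value of {key} must be a string")
            parts.append(value)
    return "".join(parts) if parts else None

print(compute_extra_key(SimpleNamespace(cache_salt="tenant-a/", extra_key="batch-42")))
# -> "tenant-a/batch-42"
print(compute_extra_key(SimpleNamespace(cache_salt=None, extra_key=None)))
# -> None
```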
@@ -165,14 +184,14 @@
         )
         return json.dumps({"error": error.model_dump()})
 
-    def
+    def extract_custom_labels(self, raw_request):
         if (
             not self.allowed_custom_labels
             or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
         ):
             return None
 
-
+        custom_labels = None
         header = (
             self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
         )
@@ -187,9 +206,9 @@
             raw_labels = None
 
         if isinstance(raw_labels, dict):
-
+            custom_labels = {
                 label: value
                 for label, value in raw_labels.items()
                 if label in self.allowed_custom_labels
             }
-        return
+        return custom_labels
sglang/srt/entrypoints/openai/serving_chat.py

@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError
 
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -25,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
     LogProbs,
     MessageProcessingResult,
     ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -33,7 +36,10 @@ from sglang.srt.entrypoints.openai.utils import (
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
+from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
@@ -58,6 +64,7 @@ class OpenAIServingChat(OpenAIServingBase):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
         self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -74,6 +81,23 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
         max_output_tokens = request.max_completion_tokens or request.max_tokens
         server_context_length = self.tokenizer_manager.server_args.context_length
         if (
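Tool `parameters` are now validated as JSON Schema (Draft 2020-12) during request validation, so malformed schemas are rejected up front with a clear message rather than failing later in constrained decoding. A small demonstration of the check the new code performs:

```python
# What the new validation catches: check_schema validates a tool's
# "parameters" against the Draft 2020-12 meta-schema.
from jsonschema import Draft202012Validator, SchemaError

good = {"type": "object", "properties": {"city": {"type": "string"}}}
bad = {"type": "object", "properties": {"city": {"type": "strng"}}}  # typo: "strng"

Draft202012Validator.check_schema(good)  # passes silently

try:
    Draft202012Validator.check_schema(bad)
except SchemaError as e:
    print(f"invalid 'parameters' schema: {e.message}")
```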
@@ -128,8 +152,8 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
 
-        # Extract
-
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
 
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
@@ -149,7 +173,9 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
-
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )
 
         return adapted_request, request
@@ -187,6 +213,14 @@ class OpenAIServingChat(OpenAIServingBase):
             tool_call_constraint = parser.get_structure_constraint(
                 request.tool_choice
             )
+            # Handle JSON schema constraint directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                json_schema = get_json_schema_constraint(
+                    request.tools, request.tool_choice
+                )
+                tool_call_constraint = ("json_schema", json_schema)
 
         # Use chat template
         if self.template_manager.chat_template_name is None:
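With `tool_choice="required"` or a named tool, decoding is now constrained by a JSON schema derived from the tool definitions, so the model must emit a parseable JSON array of tool calls. A sketch of triggering this path, assuming a local SGLang server:

```python
# Sketch: force a tool call. The server-side change constrains decoding with a
# JSON schema built from `tools`, so the output is guaranteed to parse.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
    tool_choice="required",  # or {"type": "function", "function": {"name": "get_weather"}}
)
print(resp.choices[0].message.tool_calls)
```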
@@ -434,6 +468,10 @@ class OpenAIServingChat(OpenAIServingBase):
             sampling_params[constraint_type] = convert_json_schema_to_str(
                 constraint_value.model_dump(by_alias=True)
             )
+        elif constraint_type == "json_schema":
+            sampling_params[constraint_type] = convert_json_schema_to_str(
+                constraint_value
+            )
         else:
             sampling_params[constraint_type] = constraint_value
         return sampling_params
@@ -526,10 +564,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
-                if (
-                    self.tokenizer_manager.server_args.reasoning_parser
-                    and request.separate_reasoning
-                ):
+                if self.reasoning_parser and request.separate_reasoning:
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
                     )
@@ -719,7 +754,7 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Handle reasoning content
         reasoning_text = None
-        reasoning_parser = self.
+        reasoning_parser = self.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
@@ -747,8 +782,13 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.tools
            and self.tool_call_parser
         ):
+            history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text,
+                text,
+                request.tools,
+                finish_reason,
+                request.tool_choice,
+                history_tool_calls_cnt,
             )
 
         choice_data = ChatCompletionResponseChoice(
|
@@ -838,13 +878,76 @@ class OpenAIServingChat(OpenAIServingBase):
|
|
838
878
|
token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
|
839
879
|
return ChoiceLogprobs(content=token_logprobs)
|
840
880
|
|
881
|
+
def _process_tool_call_id(
|
882
|
+
self,
|
883
|
+
call_item: ToolCallItem,
|
884
|
+
history_tool_calls_cnt: int,
|
885
|
+
) -> str:
|
886
|
+
"""Process for generating a new and unique `tool_call_id`"""
|
887
|
+
if self.tool_call_parser != "kimi_k2":
|
888
|
+
# A simple uuid is sufficient for all models except for Kimi-K2.
|
889
|
+
tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
|
890
|
+
return tool_call_id
|
891
|
+
else:
|
892
|
+
# Align with Kimi-K2 format: functions.{name}:{index}
|
893
|
+
# Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
|
894
|
+
# Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
|
895
|
+
tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
|
896
|
+
logger.debug(
|
897
|
+
f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
|
898
|
+
)
|
899
|
+
return tool_call_id
|
900
|
+
|
841
901
|
def _process_tool_calls(
|
842
902
|
self,
|
843
903
|
text: str,
|
844
904
|
tools: List[Any],
|
845
905
|
finish_reason: Dict[str, Any],
|
846
|
-
|
906
|
+
tool_choice: Optional[Union[str, ToolChoice]] = None,
|
907
|
+
history_tool_calls_cnt: int = 0,
|
908
|
+
) -> ToolCallProcessingResult:
|
847
909
|
"""Process tool calls in the response"""
|
910
|
+
|
911
|
+
# Handle required or named tool choice
|
912
|
+
if tool_choice == "required" or (
|
913
|
+
isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
|
914
|
+
):
|
915
|
+
# Set finish reason to tool_calls since we're processing tool calls
|
916
|
+
if finish_reason["type"] == "stop":
|
917
|
+
finish_reason["type"] = "tool_calls"
|
918
|
+
finish_reason["matched"] = None
|
919
|
+
try:
|
920
|
+
# For required tool choice, we expect a JSON array of tool calls
|
921
|
+
tool_call_data = json.loads(text)
|
922
|
+
tool_calls = []
|
923
|
+
for i, tool in enumerate(tool_call_data):
|
924
|
+
# Create a ToolCallItem from the JSON data
|
925
|
+
call_info = ToolCallItem(
|
926
|
+
tool_index=i, # Use the loop index as tool_index
|
927
|
+
name=tool["name"],
|
928
|
+
parameters=json.dumps(tool["parameters"], ensure_ascii=False),
|
929
|
+
)
|
930
|
+
tool_id = self._process_tool_call_id(
|
931
|
+
call_info, history_tool_calls_cnt
|
932
|
+
)
|
933
|
+
tool_calls.append(
|
934
|
+
ToolCall(
|
935
|
+
id=tool_id,
|
936
|
+
index=i,
|
937
|
+
function=FunctionResponse(
|
938
|
+
name=tool["name"],
|
939
|
+
arguments=json.dumps(
|
940
|
+
tool["parameters"], ensure_ascii=False
|
941
|
+
),
|
942
|
+
),
|
943
|
+
)
|
944
|
+
)
|
945
|
+
return ToolCallProcessingResult(tool_calls, "", finish_reason)
|
946
|
+
except json.JSONDecodeError as e:
|
947
|
+
logger.error(f"Tool call parsing error: {e}")
|
948
|
+
return ToolCallProcessingResult(None, text, finish_reason)
|
949
|
+
|
950
|
+
# Use parser since output is not constrained by JSON schema
|
848
951
|
parser = FunctionCallParser(tools, self.tool_call_parser)
|
849
952
|
if parser.has_tool_call(text):
|
850
953
|
if finish_reason["type"] == "stop":
|
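For Kimi-K2, tool call IDs follow `functions.{name}:{index}` with a globally increasing index, so the per-message `tool_index` is offset by the number of tool calls already in the conversation history. A quick illustration of the numbering:

```python
# Illustration of the Kimi-K2 ID numbering: two tool calls already exist in
# the history (indices 0 and 1), so new calls in this message continue at 2.
history_tool_calls_cnt = 2

new_calls = [("get_weather", 0), ("get_time", 1)]  # (name, local tool_index)
for name, local_idx in new_calls:
    print(f"functions.{name}:{history_tool_calls_cnt + local_idx}")
# functions.get_weather:2
# functions.get_time:3
```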
@@ -854,15 +957,9 @@ class OpenAIServingChat(OpenAIServingBase):
                 text, call_info_list = parser.parse_non_stream(text)
                 tool_calls = []
                 for call_info in call_info_list:
-
-
-
-                        and call_info.name is not None
-                    ):
-                        tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
-                    else:
-                        tool_id = f"call_{uuid.uuid4().hex[:24]}"
-
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
                     tool_calls.append(
                         ToolCall(
                             id=tool_id,
@@ -872,13 +969,13 @@ class OpenAIServingChat(OpenAIServingBase):
                             ),
                         )
                     )
-                return tool_calls, text, finish_reason
+                return ToolCallProcessingResult(tool_calls, text, finish_reason)
             except Exception as e:
                 logger.error(f"Tool call parsing error: {e}")
                 # Return error but don't fail the whole request
-                return None, text, finish_reason
+                return ToolCallProcessingResult(None, text, finish_reason)
 
-        return None, text, finish_reason
+        return ToolCallProcessingResult(None, text, finish_reason)
 
     def _process_streaming_logprobs(
         self, content: Dict[str, Any], n_prev_token: int
@@ -911,13 +1008,33 @@ class OpenAIServingChat(OpenAIServingBase):
                 or self._get_enable_thinking_from_request(request)
             )
             reasoning_parser_dict[index] = ReasoningParser(
-                self.
+                self.reasoning_parser,
                 request.stream_reasoning,
                 is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
+    def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+        """Counts the number of tool calls in the request's message history.
+
+        NOTE: This method is only useful for models that include self-increasing
+        history tool call idx in tool calls id, such as kimi-k2
+
+        Args:
+            request: The chat completion request object.
+
+        Returns:
+            The total number of tool calls in the history, or 0 if not applicable.
+        """
+        messages = getattr(request, "messages", [])
+        idx = 0
+        for msg in messages:
+            if msg.role == "assistant":
+                tool_calls = getattr(msg, "tool_calls", None)
+                idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
+        return idx
+
     def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
 
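A small standalone check of the counting logic above, using stand-in message objects:

```python
# Stand-in demo: only assistant messages' tool_calls are counted.
from types import SimpleNamespace as Msg

messages = [
    Msg(role="user", content="weather?"),
    Msg(role="assistant", tool_calls=[{"id": "a"}, {"id": "b"}]),
    Msg(role="tool", content="sunny"),
    Msg(role="assistant", tool_calls=None),
]

idx = 0
for msg in messages:
    if msg.role == "assistant":
        tool_calls = getattr(msg, "tool_calls", None)
        idx += len(list(tool_calls)) if tool_calls is not None else 0
print(idx)  # 2
```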
@@ -931,11 +1048,11 @@ class OpenAIServingChat(OpenAIServingBase):
         """
         if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
             # For Qwen3 models, `enable_thinking` is supported.
-            if
-                return request.chat_template_kwargs.get("enable_thinking")
+            if self.reasoning_parser in ["qwen3", "glm45"]:
+                return request.chat_template_kwargs.get("enable_thinking", False)
             # For DeepSeek-V3.1 models, `thinking` is supported.
-            elif
-                return request.chat_template_kwargs.get("thinking")
+            elif self.reasoning_parser in ["deepseek-v3"]:
+                return request.chat_template_kwargs.get("thinking", False)
             else:
                 return False
         return False
@@ -951,13 +1068,25 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
-
-
-
-            )
+            # Use JSON detector directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                parser_dict[index] = JsonArrayParser()
+            else:
+                parser_dict[index] = FunctionCallParser(
+                    tools=request.tools,
+                    tool_call_parser=self.tool_call_parser,
+                )
+
         parser = parser_dict[index]
 
-
+        # Handle both FunctionCallParser and JsonArrayParser
+        if isinstance(parser, JsonArrayParser):
+            result = parser.parse_streaming_increment(delta, request.tools)
+            normal_text, calls = result.normal_text, result.calls
+        else:
+            normal_text, calls = parser.parse_stream_chunk(delta)
 
         # Yield normal text
         if normal_text:
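When decoding is schema-constrained, the stream carries a plain JSON array rather than model-specific tool-call markup, hence the dedicated `JsonArrayParser`. A toy sketch of the buffering idea behind incremental JSON-array parsing; this is an illustration of the concept, not SGLang's actual `JsonArrayParser` implementation:

```python
# Toy sketch of incremental JSON-array tool-call parsing: buffer chunks and
# emit completed calls once the array parses. Not SGLang's implementation.
import json

buffer = ""
chunks = ['[{"name": "get_w', 'eather", "parameters"', ': {"city": "Paris"}}]']
for delta in chunks:
    buffer += delta
    try:
        calls = json.loads(buffer)  # succeeds only once the array is complete
    except json.JSONDecodeError:
        continue  # keep buffering
    for i, call in enumerate(calls):
        print(i, call["name"], json.dumps(call["parameters"]))
# 0 get_weather {"city": "Paris"}
```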
@@ -975,6 +1104,7 @@ class OpenAIServingChat(OpenAIServingBase):
             yield f"data: {chunk.model_dump_json()}\n\n"
 
         # Yield tool calls
+        history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
         for call_item in calls:
             # Mark that this choice has tool calls
             has_tool_calls[index] = True
@@ -982,11 +1112,9 @@ class OpenAIServingChat(OpenAIServingBase):
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-
-
-
-                else:
-                    tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                tool_call_id = self._process_tool_call_id(
+                    call_item, history_tool_calls_cnt
+                )
                 function_name = call_item.name
             else:
                 # Subsequent chunks: null ID and name for argument deltas
@@ -1017,7 +1145,7 @@ class OpenAIServingChat(OpenAIServingBase):
 
     def _check_for_unstreamed_tool_args(
         self,
-        parser: FunctionCallParser,
+        parser: Union[FunctionCallParser, JsonArrayParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
         index: int,
@@ -1027,30 +1155,31 @@ class OpenAIServingChat(OpenAIServingBase):
         when generation finishes. This ensures tool calls are properly completed
         even if the model generates the final arguments in the last chunk.
         """
-        #
+        # Get the detector - either from FunctionCallParser or directly if json detector
+        detector = parser.detector if hasattr(parser, "detector") else parser
+
+        # Only check if we have tool calls and the detector has tracked data
         if (
-            not hasattr(
-            or not
+            not hasattr(detector, "prev_tool_call_arr")
+            or not detector.prev_tool_call_arr
         ):
             return None
 
         if (
-            not hasattr(
-            or not
+            not hasattr(detector, "streamed_args_for_tool")
+            or not detector.streamed_args_for_tool
         ):
             return None
 
         # Get the last tool call that was being processed
-        tool_index = len(
-        if tool_index < 0 or tool_index >= len(
+        tool_index = len(detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
             return None
 
         # Get expected vs actual arguments
-        expected_args =
-            "arguments", {}
-        )
+        expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
         expected_call = json.dumps(expected_args, ensure_ascii=False)
-        actual_call =
+        actual_call = detector.streamed_args_for_tool[tool_index]
 
         # Check if there are remaining arguments to send
         remaining_call = (
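The method ends by comparing the fully parsed arguments against what has already been streamed and flushing the difference; the diff cuts off at `remaining_call = (`, so the exact computation is not shown. A toy sketch under the assumption that the streamed text is a strict prefix of the expected JSON:

```python
# Toy sketch: flush arguments the model finalized but never streamed.
# Assumes the streamed text is a prefix of the expected JSON string.
import json

expected_args = {"city": "Paris", "unit": "celsius"}
expected_call = json.dumps(expected_args, ensure_ascii=False)
actual_call = '{"city": "Paris", "unit": "ce'  # what reached the client so far

remaining_call = (
    expected_call[len(actual_call):]
    if expected_call.startswith(actual_call)
    else ""
)
print(remaining_call)  # lsius"}
```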