sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
- sglang/bench_one_batch_server.py +340 -34
- sglang/bench_serving.py +340 -159
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +9 -2
- sglang/profiler.py +20 -3
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +309 -0
- sglang/srt/configs/load_config.py +33 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +284 -118
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +576 -0
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +6 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -15
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +268 -98
- sglang/srt/disaggregation/decode.py +172 -39
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +203 -555
- sglang/srt/disaggregation/nixl/conn.py +217 -63
- sglang/srt/disaggregation/prefill.py +113 -270
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +203 -97
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +85 -65
- sglang/srt/entrypoints/grpc_server.py +632 -305
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +169 -17
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +327 -34
- sglang/srt/entrypoints/openai/serving_base.py +74 -8
- sglang/srt/entrypoints/openai/serving_chat.py +202 -118
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +20 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +47 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +323 -0
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +21 -16
- sglang/srt/function_call/glm4_moe_detector.py +4 -8
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +61 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +98 -7
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/grpc_request_manager.py +915 -0
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
- sglang/srt/layers/activation.py +11 -7
- sglang/srt/layers/attention/aiter_backend.py +17 -18
- sglang/srt/layers/attention/ascend_backend.py +125 -10
- sglang/srt/layers/attention/attention_registry.py +226 -0
- sglang/srt/layers/attention/base_attn_backend.py +32 -4
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +52 -15
- sglang/srt/layers/attention/flashinfer_backend.py +357 -212
- sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
- sglang/srt/layers/attention/flashmla_backend.py +9 -7
- sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
- sglang/srt/layers/attention/mamba/mamba.py +514 -1
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +23 -0
- sglang/srt/layers/attention/nsa_backend.py +1201 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +249 -42
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
- sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +61 -3
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +19 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +28 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +47 -15
- sglang/srt/layers/linear.py +30 -5
- sglang/srt/layers/logits_processor.py +161 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
- sglang/srt/layers/moe/ep_moe/layer.py +243 -448
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +27 -1
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +86 -20
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +43 -15
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +141 -81
- sglang/srt/layers/quantization/mxfp4.py +17 -34
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -24
- sglang/srt/layers/quantization/w8a8_int8.py +45 -27
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +750 -46
- sglang/srt/layers/sampler.py +84 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +23 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +9 -4
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +33 -7
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +41 -17
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +83 -152
- sglang/srt/managers/data_parallel_controller.py +156 -87
- sglang/srt/managers/detokenizer_manager.py +51 -24
- sglang/srt/managers/io_struct.py +223 -129
- sglang/srt/managers/mm_utils.py +49 -10
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +130 -0
- sglang/srt/managers/schedule_batch.py +340 -529
- sglang/srt/managers/schedule_policy.py +158 -18
- sglang/srt/managers/scheduler.py +665 -620
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
- sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
- sglang/srt/managers/tokenizer_manager.py +462 -226
- sglang/srt/managers/tp_worker.py +217 -156
- sglang/srt/managers/utils.py +79 -47
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +42 -28
- sglang/srt/mem_cache/base_prefix_cache.py +3 -3
- sglang/srt/mem_cache/chunk_cache.py +20 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +38 -0
- sglang/srt/mem_cache/hicache_storage.py +44 -2
- sglang/srt/mem_cache/hiradix_cache.py +134 -34
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +602 -208
- sglang/srt/mem_cache/memory_pool_host.py +134 -183
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +263 -78
- sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +115 -58
- sglang/srt/metrics/collector.py +113 -120
- sglang/srt/metrics/func_timer.py +3 -8
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +81 -36
- sglang/srt/model_executor/forward_batch_info.py +40 -50
- sglang/srt/model_executor/model_runner.py +507 -319
- sglang/srt/model_executor/npu_graph_runner.py +11 -5
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +438 -37
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +200 -27
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +40 -56
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +25 -4
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +793 -235
- sglang/srt/models/dots_ocr.py +171 -0
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +570 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -3
- sglang/srt/models/glm4_moe.py +17 -40
- sglang/srt/models/glm4_moe_nextn.py +4 -4
- sglang/srt/models/glm4v.py +3 -2
- sglang/srt/models/glm4v_moe.py +6 -6
- sglang/srt/models/gpt_oss.py +12 -35
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +4 -2
- sglang/srt/models/llama.py +6 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +6 -23
- sglang/srt/models/longcat_flash_nextn.py +4 -15
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +27 -6
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +5 -5
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +70 -4
- sglang/srt/models/qwen2_vl.py +6 -3
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +50 -38
- sglang/srt/models/qwen3_next.py +43 -21
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +791 -0
- sglang/srt/models/qwen3_vl_moe.py +343 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +268 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +61 -0
- sglang/srt/multimodal/processors/base_processor.py +21 -9
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +2 -4
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +20 -10
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +83 -17
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +36 -23
- sglang/srt/sampling/sampling_params.py +75 -0
- sglang/srt/server_args.py +1300 -338
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +161 -0
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
- sglang/srt/speculative/eagle_info.py +786 -0
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +113 -1270
- sglang/srt/speculative/eagle_worker.py +120 -285
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/ngram_info.py +433 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +49 -0
- sglang/srt/speculative/spec_utils.py +641 -0
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +35 -18
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/{utils.py → utils/common.py} +583 -113
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +120 -11
- sglang/test/runners.py +3 -1
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +8 -2
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +3 -4
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +430 -0
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +93 -1
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +432 -16
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
- sglang/srt/entrypoints/grpc_request_manager.py +0 -580
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import torch.nn.functional as F
|
|
10
|
+
from fastapi import Request
|
|
11
|
+
from fastapi.responses import ORJSONResponse
|
|
12
|
+
|
|
13
|
+
from sglang.srt.entrypoints.openai.protocol import (
|
|
14
|
+
ClassifyRequest,
|
|
15
|
+
ClassifyResponse,
|
|
16
|
+
ErrorResponse,
|
|
17
|
+
)
|
|
18
|
+
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
|
|
19
|
+
from sglang.srt.managers.io_struct import EmbeddingReqInput
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from sglang.srt.managers.template_manager import TemplateManager
|
|
23
|
+
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class OpenAIServingClassify(OpenAIServingBase):
|
|
29
|
+
"""Handler for v1/classify requests"""
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
tokenizer_manager: TokenizerManager,
|
|
34
|
+
template_manager: TemplateManager,
|
|
35
|
+
):
|
|
36
|
+
super().__init__(tokenizer_manager)
|
|
37
|
+
self.template_manager = template_manager
|
|
38
|
+
self.id2label = self._get_id2label_mapping()
|
|
39
|
+
self.model_name = (
|
|
40
|
+
self.tokenizer_manager.served_model_name
|
|
41
|
+
if self.tokenizer_manager.served_model_name
|
|
42
|
+
else self.tokenizer_manager.server_args.model_path
|
|
43
|
+
)
|
|
44
|
+
if not self.id2label:
|
|
45
|
+
raise ValueError("id2label mapping is missing")
|
|
46
|
+
|
|
47
|
+
def _request_id_prefix(self) -> str:
|
|
48
|
+
return "classify-"
|
|
49
|
+
|
|
50
|
+
def _convert_to_internal_request(
|
|
51
|
+
self,
|
|
52
|
+
request: ClassifyRequest,
|
|
53
|
+
raw_request: Request = None,
|
|
54
|
+
) -> tuple[EmbeddingReqInput, ClassifyRequest]:
|
|
55
|
+
"""Convert OpenAI embedding request to internal format"""
|
|
56
|
+
prompt = request.input
|
|
57
|
+
|
|
58
|
+
if isinstance(prompt, str):
|
|
59
|
+
# Single string input
|
|
60
|
+
prompt_kwargs = {"text": prompt}
|
|
61
|
+
elif isinstance(prompt, list):
|
|
62
|
+
if len(prompt) > 0 and isinstance(prompt[0], str):
|
|
63
|
+
prompt_kwargs = {"text": prompt}
|
|
64
|
+
else:
|
|
65
|
+
# List of integers (token IDs) or empty list
|
|
66
|
+
prompt_kwargs = {"input_ids": prompt}
|
|
67
|
+
else:
|
|
68
|
+
# Other types (should not happen but handle gracefully)
|
|
69
|
+
prompt_kwargs = {"input_ids": prompt}
|
|
70
|
+
|
|
71
|
+
adapted_request = EmbeddingReqInput(
|
|
72
|
+
**prompt_kwargs,
|
|
73
|
+
rid=request.rid,
|
|
74
|
+
priority=request.priority,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return adapted_request, request
|
|
78
|
+
|
|
79
|
+
def _validate_request(self, request: ClassifyRequest) -> Optional[str]:
|
|
80
|
+
"""Validate that the input is not empty or whitespace only."""
|
|
81
|
+
if not (input := request.input):
|
|
82
|
+
return "Input cannot be empty"
|
|
83
|
+
|
|
84
|
+
# Handle single string
|
|
85
|
+
if isinstance(input, str):
|
|
86
|
+
if not input.strip():
|
|
87
|
+
return "Input cannot be empty or whitespace only"
|
|
88
|
+
return None
|
|
89
|
+
|
|
90
|
+
# Handle list inputs
|
|
91
|
+
if isinstance(input, list):
|
|
92
|
+
# Check first element to determine type
|
|
93
|
+
first_item = input[0]
|
|
94
|
+
|
|
95
|
+
if isinstance(first_item, str):
|
|
96
|
+
# List of strings
|
|
97
|
+
for i, item in enumerate(input):
|
|
98
|
+
if not isinstance(item, str):
|
|
99
|
+
return f"All items in input list must be strings"
|
|
100
|
+
if not item.strip():
|
|
101
|
+
return f"Input at index {i} cannot be empty or whitespace only"
|
|
102
|
+
elif isinstance(first_item, int):
|
|
103
|
+
# List of integers (token IDs)
|
|
104
|
+
for i, item in enumerate(input):
|
|
105
|
+
if not isinstance(item, int):
|
|
106
|
+
return f"All items in input list must be integers"
|
|
107
|
+
if item < 0:
|
|
108
|
+
return f"Token ID at index {i} must be non-negative"
|
|
109
|
+
return None
|
|
110
|
+
|
|
111
|
+
def _get_id2label_mapping(self) -> Optional[Dict[int, str]]:
|
|
112
|
+
"""Get id2label mapping from model config."""
|
|
113
|
+
try:
|
|
114
|
+
hf_config = self.tokenizer_manager.model_config.hf_config
|
|
115
|
+
# Check for id2label in hf_config
|
|
116
|
+
if hf_config.id2label:
|
|
117
|
+
return hf_config.id2label
|
|
118
|
+
# Check for num_labels and create default mapping if needed
|
|
119
|
+
if hasattr(hf_config, "num_labels") and hf_config.num_labels:
|
|
120
|
+
num_labels = hf_config.num_labels
|
|
121
|
+
# Create default mapping: {0: "LABEL_0", 1: "LABEL_1", ...}
|
|
122
|
+
return {i: f"LABEL_{i}" for i in range(num_labels)}
|
|
123
|
+
|
|
124
|
+
except Exception as e:
|
|
125
|
+
logger.warning(f"Failed to get id2label mapping: {e}")
|
|
126
|
+
|
|
127
|
+
return None
|
|
128
|
+
|
|
129
|
+
async def _handle_non_streaming_request(
|
|
130
|
+
self,
|
|
131
|
+
adapted_request: EmbeddingReqInput,
|
|
132
|
+
request: ClassifyRequest,
|
|
133
|
+
raw_request: Request,
|
|
134
|
+
) -> Union[ClassifyResponse, ErrorResponse, ORJSONResponse]:
|
|
135
|
+
"""Handle non-streaming classification request."""
|
|
136
|
+
# Generate request ID
|
|
137
|
+
|
|
138
|
+
try:
|
|
139
|
+
ret = await self.tokenizer_manager.generate_request(
|
|
140
|
+
adapted_request, raw_request
|
|
141
|
+
).__anext__()
|
|
142
|
+
except ValueError as e:
|
|
143
|
+
return self.create_error_response(str(e))
|
|
144
|
+
|
|
145
|
+
if not isinstance(ret, list):
|
|
146
|
+
ret = [ret]
|
|
147
|
+
|
|
148
|
+
response = self._build_classify_response(ret)
|
|
149
|
+
return response
|
|
150
|
+
|
|
151
|
+
def _build_classify_response(self, ret: List[Dict[str, Any]]) -> ClassifyResponse:
    """Convert raw per-item outputs into a ``ClassifyResponse``.

    For every item the ``"embedding"`` list is turned into a float32 tensor,
    softmaxed along dim 0 to obtain class probabilities, and its argmax index
    is translated to a label through ``self.id2label``.

    Args:
        ret: One dict per classified input. Each may carry an ``"embedding"``
            list and a ``"meta_info"`` dict with a ``"prompt_tokens"`` count.

    Returns:
        A ``ClassifyResponse`` with one entry per item (index, label, probs,
        num_classes) and the summed prompt-token usage.
    """
    request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
    created_time = int(time.time())
    # NOTE(review): presumably populated from the model config at init time
    # and possibly None -- confirm against the constructor.
    id2label = getattr(self, "id2label", None)
    classify_objects = []
    prompt_tokens = 0

    for i, item in enumerate(ret):
        embedding = item.get("embedding", [])
        meta_info = item.get("meta_info", {})

        prompt_tokens += meta_info.get("prompt_tokens", 0)

        # Placeholder result, kept when there is no embedding or when the
        # embedding itself cannot be processed.
        probs = [1.0]
        label = "Default"

        if embedding:
            try:
                embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
                item_probs = F.softmax(embedding_tensor, dim=0).tolist()
                predicted_class = torch.argmax(embedding_tensor).item()
            except Exception as e:
                logger.error(f"Error processing embedding for item {i}: {e}")
            else:
                probs = item_probs
                # The label lookup is kept out of the try above on purpose:
                # a missing/None mapping must not throw away probabilities
                # that were computed successfully.
                try:
                    label = id2label[predicted_class]
                except (KeyError, IndexError, TypeError):
                    label = f"LABEL_{predicted_class}"

        classify_objects.append(
            {
                "index": i,
                "label": label,
                "probs": probs,
                "num_classes": len(probs),
            }
        )

    response = {
        "id": request_id,
        "object": "list",
        "created": created_time,
        "model": self.model_name,
        "data": classify_objects,
        "usage": {
            "prompt_tokens": prompt_tokens,
            # Classification produces no completion tokens, so the total
            # equals the prompt token count.
            "total_tokens": prompt_tokens,
            "completion_tokens": 0,
            "prompt_tokens_details": None,
        },
    }

    return ClassifyResponse(**response)
|
|
@@ -90,8 +90,19 @@ class OpenAIServingCompletion(OpenAIServingBase):
|
|
|
90
90
|
else:
|
|
91
91
|
prompt_kwargs = {"input_ids": prompt}
|
|
92
92
|
|
|
93
|
-
# Extract
|
|
94
|
-
|
|
93
|
+
# Extract custom labels from raw request headers
|
|
94
|
+
custom_labels = self.extract_custom_labels(raw_request)
|
|
95
|
+
|
|
96
|
+
# Resolve LoRA adapter from model parameter or explicit lora_path
|
|
97
|
+
lora_path = self._resolve_lora_path(request.model, request.lora_path)
|
|
98
|
+
if lora_path:
|
|
99
|
+
first_adapter = (
|
|
100
|
+
lora_path
|
|
101
|
+
if isinstance(lora_path, str)
|
|
102
|
+
else next((a for a in lora_path if a), None)
|
|
103
|
+
)
|
|
104
|
+
if first_adapter:
|
|
105
|
+
self._validate_lora_enabled(first_adapter)
|
|
95
106
|
|
|
96
107
|
adapted_request = GenerateReqInput(
|
|
97
108
|
**prompt_kwargs,
|
|
@@ -101,13 +112,16 @@ class OpenAIServingCompletion(OpenAIServingBase):
|
|
|
101
112
|
logprob_start_len=logprob_start_len,
|
|
102
113
|
return_text_in_logprobs=True,
|
|
103
114
|
stream=request.stream,
|
|
104
|
-
lora_path=
|
|
115
|
+
lora_path=lora_path,
|
|
105
116
|
bootstrap_host=request.bootstrap_host,
|
|
106
117
|
bootstrap_port=request.bootstrap_port,
|
|
107
118
|
bootstrap_room=request.bootstrap_room,
|
|
108
119
|
return_hidden_states=request.return_hidden_states,
|
|
109
120
|
rid=request.rid,
|
|
110
|
-
|
|
121
|
+
extra_key=self._compute_extra_key(request),
|
|
122
|
+
priority=request.priority,
|
|
123
|
+
custom_labels=custom_labels,
|
|
124
|
+
custom_logit_processor=request.custom_logit_processor,
|
|
111
125
|
)
|
|
112
126
|
|
|
113
127
|
return adapted_request, request
|
|
@@ -121,6 +135,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
|
|
|
121
135
|
"min_new_tokens": request.min_tokens,
|
|
122
136
|
"stop": request.stop,
|
|
123
137
|
"stop_token_ids": request.stop_token_ids,
|
|
138
|
+
"stop_regex": request.stop_regex,
|
|
124
139
|
"top_p": request.top_p,
|
|
125
140
|
"top_k": request.top_k,
|
|
126
141
|
"min_p": request.min_p,
|
|
@@ -135,6 +150,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
|
|
|
135
150
|
"ignore_eos": request.ignore_eos,
|
|
136
151
|
"skip_special_tokens": request.skip_special_tokens,
|
|
137
152
|
"logit_bias": request.logit_bias,
|
|
153
|
+
"custom_params": request.custom_params,
|
|
138
154
|
}
|
|
139
155
|
|
|
140
156
|
# Handle response_format constraints
|
|
@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional,
|
|
|
14
14
|
|
|
15
15
|
import jinja2
|
|
16
16
|
import openai.types.responses as openai_responses_types
|
|
17
|
+
import orjson
|
|
17
18
|
from fastapi import Request
|
|
18
19
|
from fastapi.responses import ORJSONResponse
|
|
19
20
|
from openai.types.responses import (
|
|
@@ -123,6 +124,39 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
|
123
124
|
|
|
124
125
|
self.background_tasks: dict[str, asyncio.Task] = {}
|
|
125
126
|
|
|
127
|
+
# error helpers dedicated for v1/responses
|
|
128
|
+
def create_error_response(
    self,
    message: str,
    err_type: str = "invalid_request_error",
    status_code: int = 400,
    param: Optional[str] = None,
) -> ORJSONResponse:
    """Build the error reply used by the v1/responses handler.

    The error details are nested under a top-level ``"error"`` key, and
    ``status_code`` is used both as the HTTP status and as the ``"code"``
    field of the payload.
    """
    payload = {
        "error": {
            "message": message,
            "type": err_type,
            "param": param,
            "code": status_code,
        }
    }
    return ORJSONResponse(content=payload, status_code=status_code)
|
|
142
|
+
|
|
143
|
+
def create_streaming_error_response(
    self,
    message: str,
    err_type: str = "BadRequestError",
    status_code: int = 400,
) -> str:
    """Serialize an error as a JSON string.

    The returned document has a single top-level ``"error"`` object holding
    the message, the error type, a ``null`` param and the status code.
    """
    error_body = dict(message=message, type=err_type, param=None, code=status_code)
    return json.dumps({"error": error_body})
|
|
159
|
+
|
|
126
160
|
def _request_id_prefix(self) -> str:
|
|
127
161
|
return "resp_"
|
|
128
162
|
|
|
@@ -245,6 +279,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
|
245
279
|
sampling_params=sampling_params,
|
|
246
280
|
stream=request.stream,
|
|
247
281
|
rid=request.request_id,
|
|
282
|
+
extra_key=self._compute_extra_key(request),
|
|
248
283
|
background=request.background,
|
|
249
284
|
)
|
|
250
285
|
|
|
@@ -744,7 +779,9 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
|
744
779
|
# Update the status to "cancelled"
|
|
745
780
|
response.status = "cancelled"
|
|
746
781
|
|
|
747
|
-
#
|
|
782
|
+
# The response_id is the same as the rid used when submitting the request
|
|
783
|
+
self.tokenizer_manager.abort_request(rid=response_id)
|
|
784
|
+
|
|
748
785
|
if task := self.background_tasks.get(response_id):
|
|
749
786
|
task.cancel()
|
|
750
787
|
try:
|
|
@@ -833,6 +870,13 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
|
833
870
|
|
|
834
871
|
async for ctx in result_generator:
|
|
835
872
|
|
|
873
|
+
# Only process context objects that implement the `is_expecting_start()` method,
|
|
874
|
+
# which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
|
|
875
|
+
# Contexts without this method are skipped, as they do not represent a new turn
|
|
876
|
+
# or are not compatible with per-turn handling in the /v1/responses endpoint.
|
|
877
|
+
if not hasattr(ctx, "is_expecting_start"):
|
|
878
|
+
continue
|
|
879
|
+
|
|
836
880
|
if ctx.is_expecting_start():
|
|
837
881
|
current_output_index += 1
|
|
838
882
|
sent_output_item_added = False
|
|
@@ -1020,7 +1064,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
|
1020
1064
|
):
|
|
1021
1065
|
function_name = previous_item.recipient[len("browser.") :]
|
|
1022
1066
|
action = None
|
|
1023
|
-
parsed_args =
|
|
1067
|
+
parsed_args = orjson.loads(previous_item.content[0].text)
|
|
1024
1068
|
if function_name == "search":
|
|
1025
1069
|
action = openai_responses_types.response_function_web_search.ActionSearch(
|
|
1026
1070
|
type="search",
|
|
@@ -1250,6 +1294,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
|
1250
1294
|
sampling_params=sampling_params,
|
|
1251
1295
|
stream=adapted_request.stream,
|
|
1252
1296
|
rid=request_id,
|
|
1297
|
+
extra_key=adapted_request.extra_key,
|
|
1253
1298
|
return_logprob=adapted_request.return_logprob,
|
|
1254
1299
|
logprob_start_len=adapted_request.logprob_start_len,
|
|
1255
1300
|
top_logprobs_num=adapted_request.top_logprobs_num,
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from http import HTTPStatus
|
|
3
|
+
from typing import List, Union
|
|
4
|
+
|
|
5
|
+
from fastapi import Request
|
|
6
|
+
|
|
7
|
+
from sglang.srt.entrypoints.openai.protocol import (
|
|
8
|
+
DetokenizeRequest,
|
|
9
|
+
DetokenizeResponse,
|
|
10
|
+
ErrorResponse,
|
|
11
|
+
TokenizeRequest,
|
|
12
|
+
TokenizeResponse,
|
|
13
|
+
)
|
|
14
|
+
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class OpenAIServingTokenize(OpenAIServingBase):
    """Handler for /v1/tokenize requests"""

    def _request_id_prefix(self) -> str:
        """Return the prefix used for request IDs of this handler."""
        return "tok-"

    def _convert_to_internal_request(
        self, request: TokenizeRequest, raw_request: Request
    ) -> tuple[TokenizeRequest, TokenizeRequest]:
        """Return the request unchanged as both adapted and original request."""
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: TokenizeRequest,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> Union[TokenizeResponse, ErrorResponse]:
        """Encode ``request.prompt`` with the tokenizer manager's tokenizer.

        A string prompt yields a flat list of token ids and an integer count;
        a list of strings yields one id list and one count per entry.

        Returns:
            A ``TokenizeResponse`` on success, otherwise the result of
            ``create_error_response`` (for an unsupported prompt type, or for
            any exception raised while encoding).
        """
        try:
            tokenizer = self.tokenizer_manager.tokenizer
            # -1 signals that the tokenizer does not expose a maximum length.
            max_model_len = getattr(tokenizer, "model_max_length", -1)

            if isinstance(request.prompt, str):
                token_ids = tokenizer.encode(
                    request.prompt,
                    add_special_tokens=request.add_special_tokens,
                )
                tokens = token_ids
                count = len(token_ids)
            elif isinstance(request.prompt, list):
                # Validate the elements up front so that a malformed list is
                # reported as an invalid request instead of being swallowed
                # by the generic handler below as an internal server error.
                if not all(isinstance(text, str) for text in request.prompt):
                    return self.create_error_response(
                        "Invalid input: 'prompt' must be a string or a list of strings."
                    )
                token_ids_list = [
                    tokenizer.encode(
                        text, add_special_tokens=request.add_special_tokens
                    )
                    for text in request.prompt
                ]
                tokens = token_ids_list
                count = [len(ids) for ids in token_ids_list]
            else:
                return self.create_error_response(
                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
                )

            return TokenizeResponse(
                tokens=tokens, count=count, max_model_len=max_model_len
            )
        except Exception as e:
            # Top-level boundary of the endpoint: log with traceback and turn
            # the failure into an error response rather than propagating it.
            logger.error("Error during tokenization", exc_info=True)
            return self.create_error_response(
                f"Internal server error during tokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class OpenAIServingDetokenize(OpenAIServingBase):
    """Handler for /v1/detokenize requests"""

    def _request_id_prefix(self) -> str:
        """Return the prefix used for request IDs of this handler."""
        return "detok-"

    def _convert_to_internal_request(
        self, request: DetokenizeRequest, raw_request: Request
    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
        """Return the request unchanged as both adapted and original request."""
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: DetokenizeRequest,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> Union[DetokenizeResponse, ErrorResponse]:
        """Decode ``request.tokens`` with the tokenizer manager's tokenizer.

        Accepted shapes, decided by the first element of the list:
          * list of ints        -> a single decoded string
          * list of lists of int -> one decoded string per sublist
          * empty list           -> the empty string

        Returns:
            A ``DetokenizeResponse`` on success, otherwise the result of
            ``create_error_response`` (for malformed input, or for any
            exception raised while decoding).
        """
        try:
            tokenizer = self.tokenizer_manager.tokenizer
            tokens = request.tokens

            if isinstance(tokens, list) and tokens and isinstance(tokens[0], int):
                if not all(isinstance(t, int) for t in tokens):
                    return self.create_error_response(
                        "Invalid input: 'tokens' must be a list of integers."
                    )
                tokens_to_decode = [int(t) for t in tokens]
                text = tokenizer.decode(
                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
                )
                text_out: Union[str, List[str]] = text
            elif isinstance(tokens, list) and tokens and isinstance(tokens[0], list):
                texts: List[str] = []
                for token_list in tokens:
                    # Check the element is a list before iterating it: a
                    # stray scalar such as [[1, 2], 3] would otherwise raise
                    # TypeError and be reported as an internal server error.
                    if not isinstance(token_list, list) or not all(
                        isinstance(t, int) for t in token_list
                    ):
                        return self.create_error_response(
                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
                        )
                    decoded_text = tokenizer.decode(
                        [int(t) for t in token_list],
                        skip_special_tokens=request.skip_special_tokens,
                    )
                    texts.append(decoded_text)
                text_out = texts
            elif isinstance(tokens, list) and not tokens:
                text_out = ""
            else:
                return self.create_error_response(
                    f"Invalid tokens type: {type(tokens)}. Expected List[int] or List[List[int]]."
                )

            return DetokenizeResponse(text=text_out)
        except Exception as e:
            # Top-level boundary of the endpoint: log with traceback and turn
            # the failure into an error response rather than propagating it.
            logger.error("Error during detokenization", exc_info=True)
            # Heuristic: an exception whose message mentions "decode" is
            # treated as bad client input instead of a server fault.
            if "decode" in str(e).lower():
                return self.create_error_response(
                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
                    err_type="DecodeError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            return self.create_error_response(
                f"Internal server error during detokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
|