sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

```diff
@@ -13,15 +13,18 @@
 # ==============================================================================
 """Pydantic models for OpenAI API protocol"""
 
+import logging
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, TypeAlias, Union
+from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
 
 from openai.types.responses import (
     ResponseFunctionToolCall,
     ResponseInputItemParam,
     ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
     ResponseReasoningItem,
 )
 from openai.types.responses.response import ToolChoice
@@ -35,6 +38,10 @@ from pydantic import (
 )
 from typing_extensions import Literal
 
+from sglang.utils import convert_json_schema_to_str
+
+logger = logging.getLogger(__name__)
+
 DEFAULT_MODEL_NAME = "default"
 
 
```
```diff
@@ -228,6 +235,15 @@ class CompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
 
     @field_validator("max_tokens")
     @classmethod
```
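The `CompletionRequest` additions above are plain pass-through fields, so they can be exercised directly over HTTP. A minimal sketch, assuming an SGLang server on localhost:30000 and its OpenAI-compatible `/v1/completions` route; field meanings are taken from the comments in the hunk:

```python
# Hedged sketch: exercises the new CompletionRequest fields.
import requests

payload = {
    "model": "default",
    "prompt": "Write a haiku about diffs.",
    "max_tokens": 64,
    # New in 0.5.3: salt the prefix cache so requests with different
    # salts never share cached prefixes (per the field comments above).
    "cache_salt": "tenant-42",
    # Extra classification key; concatenated with cache_salt server-side.
    "extra_key": "experiment-a",
    # Scheduling priority for this request.
    "priority": 1,
}

resp = requests.post("http://localhost:30000/v1/completions", json=payload, timeout=60)
print(resp.json()["choices"][0]["text"])
```

Both salt fields feed one cache key: `_compute_extra_key` in serving_base.py (further down this diff) concatenates `cache_salt` then `extra_key`.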
```diff
@@ -334,7 +350,7 @@ class FunctionResponse(BaseModel):
     """Function response."""
 
     name: Optional[str] = None
-    arguments: Optional[str] = None
+    arguments: Optional[str | Dict[str, Any]] = None
 
 
 class ToolCall(BaseModel):
@@ -383,7 +399,7 @@ class Function(BaseModel):
     """Function descriptions."""
 
     description: Optional[str] = Field(default=None, examples=[None])
-    name:
+    name: str
     parameters: Optional[object] = None
     strict: bool = False
 
@@ -434,8 +450,8 @@ class ChatCompletionRequest(BaseModel):
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
     stream_options: Optional[StreamOptions] = None
-    temperature: float =
-    top_p: float =
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
     user: Optional[str] = None
     tools: Optional[List[Tool]] = Field(default=None, examples=[None])
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
```
```diff
@@ -447,9 +463,50 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )
 
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    min_tokens: int = 0
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    continue_final_message: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    separate_reasoning: bool = True
+    stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+    # OpenAI/SGLang default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
     @model_validator(mode="before")
     @classmethod
     def set_tool_choice_default(cls, values):
```
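These SRT-only knobs travel in the same JSON body as the standard OpenAI fields. One way to reach them without abandoning the official client is `extra_body`, which the `openai` Python package merges into the request payload; a sketch, with the server URL and model name as assumptions:

```python
# Hedged sketch: passing the SRT-only sampling knobs through the
# official OpenAI client. Per the comment in the hunk above, non-SGLang
# backends simply ignore these extra fields.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Summarize this diff in one line."}],
    extra_body={
        "top_k": 40,                 # SRT-only: truncate to 40 candidates
        "min_p": 0.05,               # SRT-only: probability floor
        "repetition_penalty": 1.1,   # SRT-only
        "separate_reasoning": True,  # split reasoning from the final answer
    },
)
print(resp.choices[0].message.content)
```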
```diff
@@ -520,31 +577,81 @@ class ChatCompletionRequest(BaseModel):
 
         return values
 
-    ...
+    def to_sampling_params(
+        self,
+        stop: List[str],
+        model_generation_config: Dict[str, Any],
+        tool_call_constraint: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert request to sampling parameters.
+        Priority: user value > model generation_config > OpenAI defaults
+        """
+
+        def get_param(param_name: str):
+            value = getattr(self, param_name)
+            if value is None:
+                return model_generation_config.get(
+                    param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
+                )
+            return value
+
+        sampling_params = {
+            "temperature": get_param("temperature"),
+            "max_new_tokens": self.max_tokens or self.max_completion_tokens,
+            "min_new_tokens": self.min_tokens,
+            "stop": stop,
+            "stop_token_ids": self.stop_token_ids,
+            "top_p": get_param("top_p"),
+            "top_k": get_param("top_k"),
+            "min_p": get_param("min_p"),
+            "presence_penalty": self.presence_penalty,
+            "frequency_penalty": self.frequency_penalty,
+            "repetition_penalty": get_param("repetition_penalty"),
+            "regex": self.regex,
+            "ebnf": self.ebnf,
+            "n": self.n,
+            "no_stop_trim": self.no_stop_trim,
+            "ignore_eos": self.ignore_eos,
+            "skip_special_tokens": self.skip_special_tokens,
+            "logit_bias": self.logit_bias,
+        }
 
-    ...
+        if self.response_format and self.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                self.response_format.json_schema.schema_
+            )
+        elif self.response_format and self.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif self.response_format and self.response_format.type == "structural_tag":
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                self.response_format.model_dump(by_alias=True)
+            )
 
-    ...
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
+                )
+            elif constraint_type == "json_schema":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value
+                )
+            else:
+                sampling_params[constraint_type] = constraint_value
+
+        return sampling_params
 
 
 class ChatMessage(BaseModel):
```
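The resolution order implemented by `get_param` is the point of the earlier `temperature`/`top_p` change to `Optional[float] = None`: `None` now means "user did not set it", so the model's own `generation_config` can take effect before the OpenAI defaults. A standalone restatement of that fallback, with illustrative values:

```python
# Minimal sketch of the fallback order used by get_param() above:
# explicit user value > model generation_config > hard-coded defaults
# (_DEFAULT_SAMPLING_PARAMS). Values below are illustrative only.
DEFAULTS = {"temperature": 1.0, "top_p": 1.0, "top_k": -1}

def resolve(user_value, name, generation_config):
    if user_value is None:          # user did not set the parameter
        return generation_config.get(name, DEFAULTS[name])
    return user_value               # an explicit user value always wins

gen_cfg = {"temperature": 0.6}      # e.g. shipped with the model repo
assert resolve(None, "temperature", gen_cfg) == 0.6  # config beats default
assert resolve(None, "top_p", gen_cfg) == 1.0        # default as last resort
assert resolve(0.2, "temperature", gen_cfg) == 0.2   # user beats config
```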
```diff
@@ -641,6 +748,8 @@ class EmbeddingRequest(BaseModel):
 
     # The request id.
     rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
 
 class EmbeddingObject(BaseModel):
```
```diff
@@ -692,12 +801,50 @@ class RerankResponse(BaseModel):
     meta_info: Optional[dict] = None
 
 
+class TokenizeRequest(BaseModel):
+    """Request schema for the /tokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    prompt: Union[str, List[str]]
+    add_special_tokens: bool = Field(
+        default=True,
+        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
+    )
+
+
+class TokenizeResponse(BaseModel):
+    """Response schema for the /tokenize endpoint."""
+
+    tokens: Union[List[int], List[List[int]]]
+    count: Union[int, List[int]]
+    max_model_len: int
+
+
+class DetokenizeRequest(BaseModel):
+    """Request schema for the /detokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    tokens: Union[List[int], List[List[int]]]
+    skip_special_tokens: bool = Field(
+        default=True,
+        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
+    )
+
+
+class DetokenizeResponse(BaseModel):
+    """Response schema for the /detokenize endpoint."""
+
+    text: Union[str, List[str]]
+
+
 OpenAIServingRequest = Union[
     ChatCompletionRequest,
     CompletionRequest,
     EmbeddingRequest,
     ScoringRequest,
     V1RerankReqInput,
+    TokenizeRequest,
+    DetokenizeRequest,
 ]
 
 
```
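The four schemas above back the `/tokenize` and `/detokenize` endpoints named in their docstrings (served by the new `sglang/srt/entrypoints/openai/serving_tokenize.py` in the file list). A round-trip sketch, with the server URL as an assumption:

```python
# Hedged sketch: tokenize a prompt, then detokenize the result.
import requests

base = "http://localhost:30000"

t = requests.post(f"{base}/tokenize", json={
    "model": "default",
    "prompt": "Hello, world!",
    "add_special_tokens": True,
}).json()
print(t["tokens"], t["count"], t["max_model_len"])

d = requests.post(f"{base}/detokenize", json={
    "model": "default",
    "tokens": t["tokens"],
    "skip_special_tokens": True,
}).json()
print(d["text"])  # round-trips back to the prompt, modulo special tokens
```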
```diff
@@ -769,6 +916,13 @@ class ResponsesRequest(BaseModel):
         description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
     )
     priority: int = Field(default=0, description="Request priority")
+    extra_key: Optional[str] = Field(
+        default=None,
+        description="Extra key for classifying the request (e.g. cache_salt)",
+    )
+    cache_salt: Optional[str] = Field(
+        default=None, description="Cache salt for request caching"
+    )
 
     # SGLang-specific sampling parameters
     frequency_penalty: float = 0.0
@@ -857,6 +1011,26 @@ class ResponsesResponse(BaseModel):
     tool_choice: str = "auto"
     tools: List[ResponseTool] = Field(default_factory=list)
 
+    # OpenAI compatibility fields. not all are used at the moment.
+    # Recommend checking https://platform.openai.com/docs/api-reference/responses
+    error: Optional[dict] = None
+    incomplete_details: Optional[dict] = None  # TODO(v) support this input
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[dict] = (
+        # Unused. No model supports this. For GPT-oss, system prompt sets
+        # the field, not server args.
+        None  # {"effort": Optional[str], "summary": Optional[str]}
+    )
+    store: Optional[bool] = None
+    temperature: Optional[float] = None
+    text: Optional[dict] = None  # e.g. {"format": {"type": "text"}}
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
     @classmethod
     def from_request(
         cls,
```
```diff
@@ -871,6 +1045,41 @@ class ResponsesResponse(BaseModel):
         usage: Optional[UsageInfo],
     ) -> "ResponsesResponse":
         """Create a response from a request."""
+
+        # Determine if the output is plain text only to set text.format
+        def _is_text_only(
+            items: List[
+                Union[
+                    ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
+                ]
+            ]
+        ) -> bool:
+            if not items:
+                return False
+            for it in items:
+                # tool call -> not pure text.
+                if isinstance(it, ResponseReasoningItem) or isinstance(
+                    it, ResponseFunctionToolCall
+                ):
+                    return False
+                try:
+                    if isinstance(it, ResponseOutputText):
+                        continue
+                    elif isinstance(it, ResponseOutputMessage):
+                        if not it.content:
+                            continue
+                        for c in it.content:
+                            if not isinstance(c, ResponseOutputText):
+                                return False
+                    else:
+                        # Unknown type, not considered text-only
+                        return False
+                except AttributeError:
+                    return False
+            return True
+
+        text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
+
         return cls(
             id=request.request_id,
             created_at=created_time,
@@ -881,6 +1090,23 @@ class ResponsesResponse(BaseModel):
             parallel_tool_calls=request.parallel_tool_calls or True,
             tool_choice=request.tool_choice,
             tools=request.tools,
+            # fields for parity with v1/responses
+            error=None,
+            incomplete_details=None,
+            instructions=request.instructions,
+            max_output_tokens=request.max_output_tokens,
+            previous_response_id=request.previous_response_id,  # TODO(v): ensure this is propagated if retrieved from store
+            reasoning={
+                "effort": request.reasoning.effort if request.reasoning else None,
+                "summary": None,  # unused
+            },
+            store=request.store,
+            temperature=request.temperature,
+            text=text_format,  # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
+            top_p=request.top_p,
+            truncation=request.truncation,
+            user=request.user,
+            metadata=request.metadata or {},
         )
 
 
@@ -919,6 +1145,16 @@ class MessageProcessingResult:
     tool_call_constraint: Optional[Any] = None
 
 
+class ToolCallProcessingResult(NamedTuple):
+    """Result of processing tool calls in a response."""
+
+    tool_calls: Optional[
+        List[Any]
+    ]  # List of ToolCall objects or None if parsing failed
+    remaining_text: str  # Text remaining after parsing tool calls
+    finish_reason: Dict[str, Any]  # Updated finish reason dictionary
+
+
 class ResponseReasoningTextContent(BaseModel):
     text: str
     type: Literal["reasoning_text"] = "reasoning_text"
```
sglang/srt/entrypoints/openai/serving_base.py

```diff
@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 logger = logging.getLogger(__name__)
 
```
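The `TYPE_CHECKING` guard works here because of the new `from __future__ import annotations`: annotations are no longer evaluated at runtime, so `TokenizerManager` can appear in signatures without a real import (avoiding an import cycle). A self-contained illustration of the pattern, using `Decimal` as a stand-in:

```python
# With the future import, annotations stay unevaluated strings, so the
# TYPE_CHECKING-only import below never runs at runtime yet still
# satisfies type checkers.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from decimal import Decimal  # stands in for TokenizerManager here

def describe(x: Decimal) -> str:  # annotation never evaluated at runtime
    return f"got {x!r}"

print(describe(3))  # fine: no runtime import, and no runtime type check
```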
```diff
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
 
     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            else None
+        )
 
     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
 
         # Convert to internal format
         adapted_request, processed_request = self._convert_to_internal_request(
-            request
+            request, raw_request
         )
 
         # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -49,6 +62,12 @@ class OpenAIServingBase(ABC):
             return self.create_error_response(
                 message=e.detail, err_type=str(e.status_code), status_code=e.status_code
             )
+        except ValueError as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
```
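With the new `except ValueError` branch, bad input raised during request conversion surfaces as HTTP 400 with `err_type="BadRequest"` inside the standard envelope that `create_error_response` builds (`{"error": ...}`), instead of falling through to the generic 500 handler. A hedged client-side sketch; the URL and the specific trigger value are assumptions:

```python
# Hedged sketch: distinguishing a 400 "BadRequest" envelope from a
# server fault after this change.
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "hi"}],
        "ebnf": "not a valid grammar",  # hypothetical value that fails conversion
    },
)
if resp.status_code == 400:
    print(resp.json()["error"])  # envelope built by create_error_response
```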
```diff
@@ -73,10 +92,24 @@ class OpenAIServingBase(ABC):
 
         return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
 
+    def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
+        parts = []
+        for key in ["cache_salt", "extra_key"]:
+            value = getattr(request, key, None)
+            if value:
+                if not isinstance(value, str):
+                    raise TypeError(
+                        f"Value of {key} must be a string, but got {type(value).__name__}"
+                    )
+                parts.append(value)
+        return "".join(parts) if parts else None
+
     @abstractmethod
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
```
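`_compute_extra_key` defines how the two new request fields combine: `cache_salt` first, then `extra_key`, skipping whichever is absent, and `None` when both are. A standalone restatement of that contract:

```python
# Minimal sketch of _compute_extra_key()'s behavior, outside the class.
from typing import Optional

def compute_extra_key(cache_salt: Optional[str], extra_key: Optional[str]) -> Optional[str]:
    # cache_salt always precedes extra_key; empty/missing parts are skipped.
    parts = [v for v in (cache_salt, extra_key) if v]
    return "".join(parts) if parts else None

assert compute_extra_key("tenant-42", "experiment-a") == "tenant-42experiment-a"
assert compute_extra_key(None, "experiment-a") == "experiment-a"
assert compute_extra_key(None, None) is None
```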
```diff
@@ -150,3 +183,32 @@ class OpenAIServingBase(ABC):
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_custom_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        custom_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            custom_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return custom_labels
```
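`extract_custom_labels` reads a JSON object from a request header and keeps only allowlisted keys. Both the header name and the allowlist come from server configuration (`tokenizer_metrics_custom_labels_header` / `tokenizer_metrics_allowed_custom_labels` on `server_args`); the concrete header name below is an assumption:

```python
# Hedged sketch: attaching per-request metric labels. "x-custom-labels"
# stands in for whatever header the server was configured with; labels
# outside the server's allowlist are silently dropped.
import json

import requests

headers = {"x-custom-labels": json.dumps({"team": "search", "env": "prod"})}
requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={"model": "default", "messages": [{"role": "user", "content": "hi"}]},
    headers=headers,
)
```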