sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
@@ -9,15 +9,18 @@ import os
|
|
9
9
|
import random
|
10
10
|
import re
|
11
11
|
import subprocess
|
12
|
+
import sys
|
12
13
|
import threading
|
13
14
|
import time
|
14
15
|
import unittest
|
15
16
|
from concurrent.futures import ThreadPoolExecutor
|
16
17
|
from dataclasses import dataclass
|
18
|
+
from datetime import datetime
|
17
19
|
from functools import partial
|
18
20
|
from pathlib import Path
|
19
21
|
from types import SimpleNamespace
|
20
|
-
from typing import Awaitable, Callable, List, Optional, Tuple
|
22
|
+
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
23
|
+
from urllib.parse import quote
|
21
24
|
|
22
25
|
import aiohttp
|
23
26
|
import numpy as np
|
@@ -41,8 +44,10 @@ from sglang.utils import get_exception_traceback
|
|
41
44
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
42
45
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
43
46
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
|
47
|
+
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
|
44
48
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
45
|
-
|
49
|
+
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
|
50
|
+
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
|
46
51
|
|
47
52
|
# MLA test models
|
48
53
|
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
|
@@ -52,6 +57,9 @@ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instru
|
|
52
57
|
DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
|
53
58
|
DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
|
54
59
|
|
60
|
+
# NVFP4 models
|
61
|
+
DEFAULT_DEEPSEEK_NVFP4_MODEL_FOR_TEST = "nvidia/DeepSeek-R1-0528-FP4"
|
62
|
+
|
55
63
|
# FP8 models
|
56
64
|
DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
|
57
65
|
DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
|
@@ -71,7 +79,13 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
|
71
79
|
# EAGLE
|
72
80
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
73
81
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
74
|
-
|
82
|
+
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
|
83
|
+
DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
|
84
|
+
DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
|
85
|
+
"meta-llama/Llama-3.1-8B-Instruct"
|
86
|
+
)
|
87
|
+
DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
88
|
+
DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
75
89
|
|
76
90
|
# Other use cases
|
77
91
|
DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
|
@@ -466,6 +480,25 @@ def try_cached_model(model_repo: str):
|
|
466
480
|
return model_dir if model_dir else model_repo
|
467
481
|
|
468
482
|
|
483
|
+
def popen_with_error_check(command: list[str], allow_exit: bool = False):
    """Start ``command`` and watch its exit status from a background thread.

    Args:
        command: Argument vector handed to ``subprocess.Popen``.
        allow_exit: When false, any termination of the process is treated as
            an error. When true, only a non-zero exit code is.

    Returns:
        The ``subprocess.Popen`` handle. It is returned immediately; the
        watcher thread keeps running until the process exits.

    Note:
        The exception is raised inside the watcher thread, so it is not
        propagated to the caller of this function.
    """
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def _run_and_check():
        # communicate() drains both pipes and only returns once the process
        # has terminated, so no additional polling of the exit status is
        # needed before reading returncode.
        stdout, stderr = process.communicate()

        if not allow_exit or process.returncode != 0:
            raise Exception(
                f"{command} exited with code {process.returncode}\n{stdout=}\n{stderr=}"
            )

    watcher = threading.Thread(target=_run_and_check)
    watcher.start()
    return process
|
500
|
+
|
501
|
+
|
469
502
|
def popen_launch_server(
|
470
503
|
model: str,
|
471
504
|
base_url: str,
|
@@ -534,11 +567,30 @@ def popen_launch_server(
|
|
534
567
|
if return_stdout_stderr:
|
535
568
|
process = subprocess.Popen(
|
536
569
|
command,
|
537
|
-
stdout=
|
538
|
-
stderr=
|
570
|
+
stdout=subprocess.PIPE,
|
571
|
+
stderr=subprocess.PIPE,
|
539
572
|
env=env,
|
540
573
|
text=True,
|
574
|
+
bufsize=1,
|
541
575
|
)
|
576
|
+
|
577
|
+
def _dump(src, sinks):
|
578
|
+
for line in iter(src.readline, ""):
|
579
|
+
for sink in sinks:
|
580
|
+
sink.write(line)
|
581
|
+
sink.flush()
|
582
|
+
src.close()
|
583
|
+
|
584
|
+
threading.Thread(
|
585
|
+
target=_dump,
|
586
|
+
args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
|
587
|
+
daemon=True,
|
588
|
+
).start()
|
589
|
+
threading.Thread(
|
590
|
+
target=_dump,
|
591
|
+
args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
|
592
|
+
daemon=True,
|
593
|
+
).start()
|
542
594
|
else:
|
543
595
|
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
|
544
596
|
|
@@ -842,6 +894,154 @@ def run_bench_serving(
|
|
842
894
|
return res
|
843
895
|
|
844
896
|
|
897
|
+
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Launch a server and benchmark its ``/v1/score`` endpoint.

    Requests are sent one after another (not concurrently). A request counts
    as successful when it returns HTTP 200 and its JSON body contains a
    ``"scores"`` or ``"logprobs"`` key; failed requests are skipped.

    Args:
        model: Model passed to the server launcher, the tokenizer loader and
            each request payload.
        num_requests: Number of score requests to send.
        batch_size: Number of items per score request.
        other_server_args: Extra arguments for the server launcher.
        need_warmup: When true, send one 3-item request before timing starts;
            errors from that request are ignored.
        device: ``"auto"`` triggers ``auto_config_device()``.

    Returns:
        A dict with the keys ``completed``, ``total_requests``,
        ``throughput`` (successful requests per second), ``avg_latency_ms``,
        ``p95_latency_ms`` and ``successful_requests``. The metric values are
        0 when no request succeeded.
    """
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        # NOTE(review): the resolved device is not forwarded to the server
        # launch below; the call is kept as in the original — confirm intent.
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"
        score_url = f"{base_url}/v1/score"

        def generate_text_with_token_count(num_tokens):
            """Generate text with precise token count using replicated token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # The marker tokenizes to more than one token: scale the
                # repetition count down accordingly.
                tokens_per_marker = len(
                    tokenizer.encode(special_token, add_special_tokens=False)
                )
                text = special_token * (num_tokens // tokens_per_marker)
            return text

        def build_payload(num_items):
            """Build one score request body with ``num_items`` items."""
            return {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens)
                    for _ in range(num_items)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

        if need_warmup:
            warmup_data = build_payload(3)

            async with aiohttp.ClientSession() as session:
                try:
                    # Enter the response context so the connection is
                    # released as soon as the warmup request completes.
                    async with session.post(
                        score_url,
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ):
                        pass
                except Exception:
                    # Warmup is best-effort. Catching Exception (rather than
                    # a bare except) lets KeyboardInterrupt and task
                    # cancellation propagate.
                    pass

        test_requests = [build_payload(batch_size) for _ in range(num_requests)]

        start_time = time.monotonic()
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        score_url,
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status != 200:
                            continue
                        response_data = await response.json()
                        request_end = time.monotonic()

                        if "scores" in response_data or "logprobs" in response_data:
                            latencies.append((request_end - request_start) * 1000)
                except Exception:
                    # A failed request is simply not counted.
                    continue

        total_time = time.monotonic() - start_time
        successful_requests = len(latencies)

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = sum(latencies) / successful_requests
            latencies.sort()
            p95_latency = latencies[int(successful_requests * 0.95)]
        else:
            throughput = 0
            avg_latency = 0
            p95_latency = 0

        return {
            "completed": successful_requests,
            "total_requests": num_requests,
            "throughput": throughput,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": p95_latency,
            "successful_requests": successful_requests,
        }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        # Always tear the server down, even when the benchmark raised.
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
|
1043
|
+
|
1044
|
+
|
845
1045
|
def run_bench_serving_multi(
|
846
1046
|
model,
|
847
1047
|
base_url,
|
@@ -1363,6 +1563,41 @@ async def send_concurrent_generate_requests(
|
|
1363
1563
|
return await asyncio.gather(*tasks)
|
1364
1564
|
|
1365
1565
|
|
1566
|
+
async def send_concurrent_generate_requests_with_custom_params(
    base_url: str,
    custom_params: List[dict[str, Any]],
) -> List[Tuple[int, Any]]:
    """Send one ``/generate`` request per entry of ``custom_params``, concurrently.

    Each entry is merged on top of a fixed base payload with a shallow
    ``dict`` update, so an entry that supplies ``"sampling_params"`` replaces
    the default sampling parameters as a whole. All requests are started at
    once, so the maximum concurrency is ``len(custom_params)``.

    Args:
        base_url: Server base URL; ``/generate`` is appended to it.
        custom_params: Per-request overrides for the base payload.

    Returns:
        A list of ``(status_code, response_json)`` tuples in the same order
        as ``custom_params``.
    """

    base_payload = {
        "text": """
System: You are a helpful assistant.
User: What is the capital of France?
Assistant: The capital of France is
""",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 50,
        },
    }

    async def _post_generate(payload):
        # Each request uses its own session, as in the original.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{base_url}/generate",
                json=payload,
            ) as response:
                resp_json = await response.json()
                return (response.status, resp_json)

    tasks = [
        asyncio.create_task(_post_generate({**base_payload, **overrides}))
        for overrides in custom_params
    ]
    return await asyncio.gather(*tasks)
|
1599
|
+
|
1600
|
+
|
1366
1601
|
class CustomTestCase(unittest.TestCase):
|
1367
1602
|
def _callTestMethod(self, method):
|
1368
1603
|
max_retry = int(
|
@@ -1404,3 +1639,146 @@ def dump_bench_raw_result(
|
|
1404
1639
|
def _ensure_remove_suffix(text: str, suffix: str):
    """Return ``text`` without its trailing ``suffix``.

    Asserts that ``text`` actually ends with ``suffix`` before stripping it.
    """
    has_suffix = text.endswith(suffix)
    assert has_suffix
    stripped = text.removesuffix(suffix)
    return stripped
|
1642
|
+
|
1643
|
+
|
1644
|
+
class ModelDeploySetup:
    """A model path together with the extra arguments to deploy it with.

    ``--enable-multimodal`` and ``--trust-remote-code`` are always present in
    ``extra_args``: each is appended when the caller did not already supply it.
    """

    def __init__(self, model_path: str, extra_args: List[str] = None):
        """Store ``model_path`` and a normalized copy of ``extra_args``.

        Args:
            model_path: Identifier or path of the model.
            extra_args: Optional additional arguments. ``None`` (the default)
                means no extra arguments beyond the two required flags.
        """
        # Copy before appending: a shared default list (or the caller's own
        # list) must never be mutated, otherwise flags leak between instances.
        args = list(extra_args) if extra_args is not None else []
        for required_flag in ("--enable-multimodal", "--trust-remote-code"):
            if required_flag not in args:
                args.append(required_flag)

        self.model_path = model_path
        self.extra_args = args
|
1653
|
+
|
1654
|
+
|
1655
|
+
class ModelEvalMetrics:
    """Plain holder for one model's evaluation accuracy and evaluation time."""

    def __init__(self, accuracy: float, eval_time: float):
        # Both values are stored as given, without validation or conversion.
        self.accuracy, self.eval_time = accuracy, eval_time
|
1659
|
+
|
1660
|
+
|
1661
|
+
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> str:
    """Return the target of the first ``[Profile](...)`` link found in ``output``.

    The link target is matched non-greedily up to the first closing
    parenthesis. ``None`` is returned when ``output`` contains no such link.
    """
    found = re.search(r"\[Profile\]\((.*?)\)", output)
    if found is None:
        return None
    return found.group(1)
|
1667
|
+
|
1668
|
+
|
1669
|
+
def parse_models(model_string: str):
    """Split a comma-separated model string into a list of trimmed names.

    Surrounding whitespace is removed from every entry, and entries that are
    empty after trimming are dropped.
    """
    names = []
    for raw_entry in model_string.split(","):
        name = raw_entry.strip()
        if name:
            names.append(name)
    return names
|
1671
|
+
|
1672
|
+
|
1673
|
+
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """Summarize per-model evaluation results and fail if any threshold is missed.

    A markdown table with one row per model in ``model_accuracy_thresholds``
    (sorted by model name) is printed and, when ``is_in_ci()`` is true, passed
    to ``write_github_step_summary`` under a ``## {test_name}`` heading.

    Args:
        results: list of tuple of (model_path, accuracy, latency)
        test_name: Heading used for the GitHub step summary.
        model_accuracy_thresholds: Mapping of model -> minimum accepted score.
            Models listed here but absent from ``results`` count as failures.
        model_latency_thresholds: Optional mapping of model -> maximum accepted
            latency. When ``None``, latency is neither checked nor shown. A
            model missing from this mapping (or mapped to ``None``) gets no
            latency limit instead of crashing the comparison.
        model_count: Accepted for interface compatibility; not used here.

    Raises:
        AssertionError: If any model misses a threshold or has no result. The
            message lists every failure, one per line.
    """
    # Sentinel meaning "no latency limit to enforce".
    no_latency_limit = 1e9
    show_latency = model_latency_thresholds is not None

    failed_models = []
    if show_latency:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    results_dict = {res[0]: (res[1], res[2]) for res in results}

    rows = []
    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        latency_threshold = (
            model_latency_thresholds.get(model) if show_latency else None
        )
        if latency_threshold is None:
            # Comparing a latency against None would raise TypeError.
            latency_threshold = no_latency_limit

        if model in results_dict:
            accuracy, latency = results_dict[model]
            # Each check is phrased positively so that a NaN score or latency
            # is reported as a failure instead of slipping through both
            # "<" and ">" comparisons unnoticed.
            score_ok = accuracy >= accuracy_threshold
            latency_ok = latency <= latency_threshold
            status_emoji = "✅" if score_ok and latency_ok else "❌"

            if not score_ok:
                failed_models.append(
                    f"\nScore Check Failed: {model}\n"
                    f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                )
            if not latency_ok:
                failed_models.append(
                    f"\nLatency Check Failed: {model}\n"
                    f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
                )

            if show_latency:
                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
            else:
                line = (
                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
                )
        else:
            # An expected model with no result at all is always a failure.
            status_emoji = "❌"
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            if show_latency:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
            else:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"

        rows.append(line)

    summary += "".join(rows)

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    if failed_models:
        print("Some models failed the evaluation.")
        raise AssertionError("\n".join(failed_models))
|
1741
|
+
|
1742
|
+
|
1743
|
+
# Bench knobs for bench_one_batch_server (override by env)
|
1744
|
+
def _parse_int_list_env(name: str, default_val: str):
    """Read a comma-separated list of integers from an environment variable.

    Args:
        name: Environment variable to read.
        default_val: Comma-separated string used when ``name`` is not set.

    Returns:
        The parsed integers in order. Empty and whitespace-only items (for
        example from a trailing comma or ``"1, ,2"``) are skipped.

    Raises:
        ValueError: If a non-blank item is not a valid integer.
    """
    raw = os.environ.get(name, default_val)
    # Filter on the stripped item: int() tolerates surrounding whitespace but
    # raises on a whitespace-only string, which a plain truthiness test lets in.
    return [int(item) for item in raw.split(",") if item.strip()]
|
1747
|
+
|
1748
|
+
|
1749
|
+
# Return filenames
|
1750
|
+
def find_traces_under_path(path: str) -> List[str]:
    """Collect the names of all ``*.trace.json.gz`` files below ``path``.

    The directory tree is walked recursively. Only bare file names are
    returned, without the directory they were found in.
    """
    return [
        filename
        for _root, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if filename.endswith(".trace.json.gz")
    ]
|
1757
|
+
|
1758
|
+
|
1759
|
+
def write_results_to_json(model, metrics, mode="a"):
    """Record one model's metrics in ``results.json`` (relative to the cwd).

    Args:
        model: Model identifier stored with the entry.
        metrics: Mapping of metric values. Must contain ``"score"``; when it
            contains ``"latency"`` that value is also stored at the top level.
        mode: ``"a"`` (default) appends to the entries already in the file;
            any other value discards them and starts a new list.

    Raises:
        KeyError: If ``metrics`` has no ``"score"`` entry.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    if "latency" in metrics:
        # Store the scalar itself; wrapping it in a 1-tuple would serialize
        # the latency as a single-element JSON array.
        result["latency"] = metrics["latency"]

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # An unparsable file is treated as empty and gets overwritten.
            existing_results = []

    # Anything other than a JSON list is replaced rather than extended.
    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
|
sglang/utils.py
CHANGED
@@ -6,6 +6,7 @@ import logging
|
|
6
6
|
import os
|
7
7
|
import random
|
8
8
|
import socket
|
9
|
+
import ssl
|
9
10
|
import subprocess
|
10
11
|
import sys
|
11
12
|
import time
|
@@ -155,7 +156,15 @@ def http_request(
|
|
155
156
|
data = bytes(dumps(json), encoding="utf-8")
|
156
157
|
|
157
158
|
try:
|
158
|
-
|
159
|
+
if sys.version_info >= (3, 13):
|
160
|
+
# Python 3.13+: Use SSL context (cafile removed)
|
161
|
+
if verify and isinstance(verify, str):
|
162
|
+
context = ssl.create_default_context(cafile=verify)
|
163
|
+
else:
|
164
|
+
context = ssl.create_default_context()
|
165
|
+
resp = urllib.request.urlopen(req, data=data, context=context)
|
166
|
+
else:
|
167
|
+
resp = urllib.request.urlopen(req, data=data, cafile=verify)
|
159
168
|
return HttpResponse(resp)
|
160
169
|
except urllib.error.HTTPError as e:
|
161
170
|
return HttpResponse(e)
|
@@ -472,11 +481,22 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
|
472
481
|
class TypeBasedDispatcher:
    """Route an object to the first registered handler whose type matches it.

    Handlers are tried in mapping order with ``isinstance``. An optional
    fallback handles objects that match none of the registered types.
    """

    def __init__(self, mapping: List[Tuple[Type, Callable]]):
        # The given list is stored as-is (not copied).
        self._fallback_fn = None
        self._mapping = mapping

    def add_fallback_fn(self, fallback_fn: Callable):
        """Set the handler used when no registered type matches."""
        self._fallback_fn = fallback_fn

    def __iadd__(self, other: "TypeBasedDispatcher"):
        """Append ``other``'s (type, handler) pairs; its fallback is not taken over."""
        self._mapping.extend(other._mapping)
        return self

    def __call__(self, obj: Any):
        """Dispatch ``obj``; raise ``ValueError`` if nothing can handle it."""
        for registered_type, handler in self._mapping:
            if not isinstance(obj, registered_type):
                continue
            return handler(obj)

        fallback = self._fallback_fn
        if fallback is None:
            raise ValueError(f"Invalid object: {obj}")
        return fallback(obj)
|
481
501
|
|
482
502
|
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.
|
1
|
+
__version__ = "0.5.3rc2"
|