sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/test/kit_matched_stop.py
@@ -0,0 +1,157 @@
+import json
+
+import requests
+
+MANY_NEW_TOKENS_PROMPT = """
+Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
+Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
+Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
+The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
+"""
+
+
+class MatchedStopMixin:
+    def _run_completions_generation(
+        self,
+        prompt=MANY_NEW_TOKENS_PROMPT,
+        max_tokens=1,
+        stop=None,
+        stop_regex=None,
+        finish_reason=None,
+        matched_stop=None,
+    ):
+        payload = {
+            "prompt": prompt,
+            "model": self.model,
+            "temperature": 0,
+            "top_p": 1,
+            "max_tokens": max_tokens,
+        }
+
+        if stop is not None:
+            payload["stop"] = stop
+
+        if stop_regex is not None:
+            payload["stop_regex"] = stop_regex
+
+        response_completions = requests.post(
+            self.base_url + "/v1/completions",
+            json=payload,
+        )
+        res = response_completions.json()
+        print(json.dumps(res))
+        print("=" * 100)
+
+        if not isinstance(matched_stop, list):
+            matched_stop = [matched_stop]
+
+        assert (
+            res["choices"][0]["finish_reason"] == finish_reason
+        ), f"Expected finish_reason: {finish_reason}, but got: {res['choices'][0]['finish_reason']}"
+        assert (
+            res["choices"][0]["matched_stop"] in matched_stop
+        ), f"Expected matched_stop: {matched_stop}, but got: {res['choices'][0]['matched_stop']}"
+
+    def _run_chat_completions_generation(
+        self,
+        prompt=MANY_NEW_TOKENS_PROMPT,
+        max_tokens=1,
+        stop=None,
+        stop_regex=None,
+        finish_reason=None,
+        matched_stop=None,
+    ):
+        chat_payload = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": prompt},
+            ],
+            "temperature": 0,
+            "top_p": 1,
+            "max_tokens": max_tokens,
+        }
+
+        if stop is not None:
+            chat_payload["stop"] = stop
+
+        if stop_regex is not None:
+            chat_payload["stop_regex"] = stop_regex
+
+        response_chat = requests.post(
+            self.base_url + "/v1/chat/completions",
+            json=chat_payload,
+        )
+        res = response_chat.json()
+        print(json.dumps(res))
+        print("=" * 100)
+
+        if not isinstance(matched_stop, list):
+            matched_stop = [matched_stop]
+
+        assert (
+            res["choices"][0]["finish_reason"] == finish_reason
+        ), f"Expected finish_reason: {finish_reason}, but got: {res['choices'][0]['finish_reason']}"
+        assert (
+            res["choices"][0]["matched_stop"] in matched_stop
+        ), f"Expected matched_stop: {matched_stop}, but got: {res['choices'][0]['matched_stop']}"
+
+    def test_finish_stop_str(self):
+        self._run_completions_generation(
+            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
+        )
+        self._run_chat_completions_generation(
+            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
+        )
+
+    def test_finish_stop_regex_str(self):
+        STOP_REGEX_STR = r"and|or"
+        self._run_completions_generation(
+            max_tokens=1000,
+            stop_regex=STOP_REGEX_STR,
+            finish_reason="stop",
+            matched_stop=STOP_REGEX_STR,
+        )
+        self._run_chat_completions_generation(
+            max_tokens=1000,
+            stop_regex=STOP_REGEX_STR,
+            finish_reason="stop",
+            matched_stop=STOP_REGEX_STR,
+        )
+
+        # Match a complete sentence
+        STOP_REGEX_STR_SENTENCE = r"[.!?]\s*$"
+        self._run_chat_completions_generation(
+            max_tokens=1000,
+            stop_regex=STOP_REGEX_STR_SENTENCE,
+            finish_reason="stop",
+            matched_stop=STOP_REGEX_STR_SENTENCE,
+        )
+
+    def test_finish_stop_eos(self):
+        llama_format_prompt = """\
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
+What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+"""
+        eos_token_ids = [128000, 128009, 2]
+        self._run_completions_generation(
+            prompt=llama_format_prompt,
+            max_tokens=1000,
+            finish_reason="stop",
+            matched_stop=eos_token_ids,
+        )
+        self._run_chat_completions_generation(
+            prompt="What is 2 + 2?",
+            max_tokens=1000,
+            finish_reason="stop",
+            matched_stop=eos_token_ids,
+        )
+
+    def test_finish_length(self):
+        self._run_completions_generation(
+            max_tokens=5, finish_reason="length", matched_stop=None
+        )
+        self._run_chat_completions_generation(
+            max_tokens=5, finish_reason="length", matched_stop=None
+        )
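The mixin above talks to a live server purely over HTTP, so a concrete test case only needs to provide self.model and self.base_url. Below is a minimal usage sketch, not part of the diff: it assumes an SGLang server is already running at the given address and serving the named model (both are placeholder values), and relies only on the standard unittest runner.

import unittest

from sglang.test.kit_matched_stop import MatchedStopMixin


class TestMatchedStop(MatchedStopMixin, unittest.TestCase):
    # Placeholder values; point these at an actual running server.
    model = "meta-llama/Llama-3.1-8B-Instruct"
    base_url = "http://127.0.0.1:30000"


if __name__ == "__main__":
    unittest.main()

With this setup, unittest picks up the mixin's test_finish_* methods automatically, and each one asserts the finish_reason and matched_stop fields returned by /v1/completions and /v1/chat/completions.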
sglang/test/longbench_v2/__init__.py
@@ -0,0 +1 @@
+"""LongBench-v2 auxiliary utilities and validation scripts."""
sglang/test/longbench_v2/test_longbench_v2_eval.py
@@ -0,0 +1,238 @@
+"""
+Test cases for LongBench-v2 evaluation utility.
+"""
+
+import json
+import os
+import tempfile
+
+from sglang.test.simple_eval_longbench_v2 import (
+    LongBenchV2Eval,
+    extract_longbench_v2_answer,
+    format_longbench_v2_question,
+)
+
+
+def test_format_longbench_v2_question():
+    """Test the official LongBench-v2 question formatting."""
+    sample_row = {
+        "context": "This is a sample context about environmental issues.",
+        "question": "What is the main theme?",
+        "A": "Technology",
+        "B": "Environment",
+        "C": "Economics",
+        "D": "Politics",
+        "answer": "B",
+    }
+
+    formatted = format_longbench_v2_question(sample_row)
+
+    # Verify official template structure
+    assert "This is a sample context about environmental issues." in formatted
+    assert (
+        "What is the correct answer to this question: What is the main theme?"
+        in formatted
+    )
+    assert "(A) Technology" in formatted
+    assert "(B) Environment" in formatted
+    assert "(C) Economics" in formatted
+    assert "(D) Politics" in formatted
+    assert "The correct answer is" in formatted
+    print("✓ Question formatting works correctly")
+
+
+def test_extract_longbench_v2_answer():
+    """Test the official LongBench-v2 answer extraction."""
+
+    # Test official format: "The correct answer is (A)"
+    response1 = "After analyzing the context, The correct answer is (B)."
+    assert extract_longbench_v2_answer(response1) == "B"
+
+    # Test alternative format: "The correct answer is A"
+    response2 = "Based on the evidence, The correct answer is C."
+    assert extract_longbench_v2_answer(response2) == "C"
+
+    # Test with asterisks
+    response3 = "*The correct answer is (D)*"
+    assert extract_longbench_v2_answer(response3) == "D"
+
+    # Test fallback to standard pattern
+    response4 = "I think the answer is A."
+    assert extract_longbench_v2_answer(response4) == "A"
+
+    # Test no answer
+    response5 = "I'm not sure about this."
+    assert extract_longbench_v2_answer(response5) is None
+
+    print("✓ Answer extraction works correctly")
+
+
+def test_longbench_v2_eval_initialization():
+    """Test LongBench-v2 evaluation class initialization."""
+
+    # Create a temporary JSON file with sample data
+    sample_data = [
+        {
+            "_id": "test_001",
+            "domain": "single_document_qa",
+            "question": "What is X?",
+            "choice_A": "Option A1",
+            "choice_B": "Option B1",
+            "choice_C": "Option C1",
+            "choice_D": "Option D1",
+            "answer": "A",
+            "context": "Context 1",
+        },
+        {
+            "_id": "test_002",
+            "domain": "multi_document_qa",
+            "question": "What is Y?",
+            "A": "Option A2",
+            "B": "Option B2",
+            "C": "Option C2",
+            "D": "Option D2",
+            "answer": "B",
+            "context": "Context 2",
+        },
+    ]
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(sample_data, f)
+        temp_file = f.name
+
+    try:
+        # Test initialization with new data_source parameter
+        eval_instance = LongBenchV2Eval(data_source=temp_file, num_examples=1)
+        assert len(eval_instance.examples) == 1
+        first_example = eval_instance.examples[0]
+        assert first_example.get("category") in {
+            "single_document_qa",
+            "multi_document_qa",
+        }
+        assert first_example.get("A") in {"Option A1", "Option A2"}
+        print("✓ Evaluation class initialization works correctly")
+
+    finally:
+        os.unlink(temp_file)
+
+
+def test_category_filtering():
+    """Ensure category filtering keeps only requested domains."""
+
+    sample_data = [
+        {
+            "_id": "test_001",
+            "domain": "single_document_qa",
+            "question": "What is X?",
+            "choice_A": "Option A1",
+            "choice_B": "Option B1",
+            "choice_C": "Option C1",
+            "choice_D": "Option D1",
+            "answer": "A",
+            "context": "Context 1",
+        },
+        {
+            "_id": "test_002",
+            "domain": "multi_document_qa",
+            "question": "What is Y?",
+            "choice_A": "Option A2",
+            "choice_B": "Option B2",
+            "choice_C": "Option C2",
+            "choice_D": "Option D2",
+            "answer": "B",
+            "context": "Context 2",
+        },
+    ]
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(sample_data, f)
+        temp_file = f.name
+
+    try:
+        eval_instance = LongBenchV2Eval(
+            data_source=temp_file,
+            categories=["multi_document_qa"],
+        )
+        assert len(eval_instance.examples) == 1
+        assert eval_instance.examples[0]["category"] == "multi_document_qa"
+        print("✓ Category filtering works correctly")
+    finally:
+        os.unlink(temp_file)
+
+
+def test_difficulty_metrics():
+    """Validate that difficulty-specific metrics are recorded."""
+
+    sample_data = [
+        {
+            "_id": "easy_001",
+            "domain": "single_document_qa",
+            "difficulty": "easy",
+            "question": "Easy question?",
+            "choice_A": "Correct",
+            "choice_B": "Wrong",
+            "choice_C": "Wrong",
+            "choice_D": "Wrong",
+            "answer": "A",
+            "context": "Easy context",
+        },
+        {
+            "_id": "hard_001",
+            "domain": "single_document_qa",
+            "difficulty": "hard",
+            "question": "Hard question?",
+            "choice_A": "Wrong",
+            "choice_B": "Correct",
+            "choice_C": "Wrong",
+            "choice_D": "Wrong",
+            "answer": "B",
+            "context": "Hard context",
+        },
+    ]
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(sample_data, f)
+        temp_file = f.name
+
+    class FixedSampler:  # noqa: D401 - simple helper
+        """Mock sampler returning the correct answer based on question text."""
+
+        def _pack_message(self, content: str, role: str):
+            return {"content": content, "role": role}
+
+        def __call__(self, messages):
+            prompt = messages[0]["content"]
+            if "Easy question" in prompt:
+                return "The correct answer is (A)"
+            return "The correct answer is (B)"
+
+    try:
+        eval_instance = LongBenchV2Eval(data_source=temp_file, num_threads=1)
+        result = eval_instance(FixedSampler())
+
+        assert result.metrics.get("difficulty_easy") == 1.0
+        assert result.metrics.get("difficulty_hard") == 1.0
+        print("✓ Difficulty metrics recorded correctly")
+    finally:
+        os.unlink(temp_file)
+
+
+def main():
+    """Run all tests."""
+    print("Testing simplified LongBench-v2 evaluation utility...\n")
+
+    test_format_longbench_v2_question()
+    test_extract_longbench_v2_answer()
+    test_longbench_v2_eval_initialization()
+    test_category_filtering()
+    test_difficulty_metrics()
+
+    print("\n" + "=" * 50)
+    print("✅ ALL TESTS PASSED!")
+    print("The simplified implementation follows SGLang patterns")
+    print("while maintaining LongBench-v2 compatibility.")
+    print("=" * 50)
+
+
+if __name__ == "__main__":
+    main()