PyPI - sglang - Versions diffs - 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (408) hide show

sglang/bench_one_batch.py +47 -28
sglang/bench_one_batch_server.py +41 -25
sglang/bench_serving.py +330 -156
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +8 -15
sglang/profiler.py +18 -1
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +13 -64
sglang/srt/configs/load_config.py +25 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +134 -23
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +0 -10
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +20 -11
sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +4 -2
sglang/srt/disaggregation/decode.py +123 -31
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +157 -19
sglang/srt/disaggregation/nixl/conn.py +69 -24
sglang/srt/disaggregation/prefill.py +96 -270
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +70 -19
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +66 -66
sglang/srt/entrypoints/grpc_server.py +431 -234
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +120 -8
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +225 -37
sglang/srt/entrypoints/openai/serving_base.py +49 -2
sglang/srt/entrypoints/openai/serving_chat.py +29 -74
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +15 -1
sglang/srt/entrypoints/openai/serving_responses.py +5 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +42 -4
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +18 -14
sglang/srt/function_call/glm4_moe_detector.py +1 -5
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/json_array_parser.py +0 -2
sglang/srt/function_call/utils.py +2 -2
sglang/srt/grpc/compile_proto.py +3 -3
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
sglang/srt/layers/activation.py +4 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/ascend_backend.py +17 -1
sglang/srt/layers/attention/attention_registry.py +43 -23
sglang/srt/layers/attention/base_attn_backend.py +20 -1
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +12 -8
sglang/srt/layers/attention/flashinfer_backend.py +248 -21
sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
sglang/srt/layers/attention/mamba/mamba.py +189 -241
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +0 -1
sglang/srt/layers/attention/nsa_backend.py +404 -90
sglang/srt/layers/attention/triton_backend.py +208 -34
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
sglang/srt/layers/attention/utils.py +11 -7
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +11 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +17 -0
sglang/srt/layers/layernorm.py +45 -15
sglang/srt/layers/linear.py +9 -1
sglang/srt/layers/logits_processor.py +147 -17
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
sglang/srt/layers/moe/ep_moe/layer.py +119 -397
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +3 -2
sglang/srt/layers/moe/utils.py +17 -1
sglang/srt/layers/quantization/__init__.py +2 -53
sglang/srt/layers/quantization/awq.py +183 -6
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +20 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +84 -18
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +42 -14
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +125 -100
sglang/srt/layers/quantization/mxfp4.py +5 -30
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +1 -4
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +51 -20
sglang/srt/layers/quantization/w8a8_int8.py +30 -24
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +673 -16
sglang/srt/layers/sampler.py +36 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +0 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/triton_backend.py +0 -1
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +24 -9
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +48 -17
sglang/srt/managers/data_parallel_controller.py +146 -42
sglang/srt/managers/detokenizer_manager.py +40 -13
sglang/srt/managers/io_struct.py +66 -16
sglang/srt/managers/mm_utils.py +20 -18
sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
sglang/srt/managers/overlap_utils.py +96 -19
sglang/srt/managers/schedule_batch.py +241 -511
sglang/srt/managers/schedule_policy.py +15 -2
sglang/srt/managers/scheduler.py +399 -499
sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
sglang/srt/managers/tokenizer_manager.py +378 -90
sglang/srt/managers/tp_worker.py +212 -161
sglang/srt/managers/utils.py +78 -2
sglang/srt/mem_cache/allocator.py +7 -2
sglang/srt/mem_cache/allocator_ascend.py +2 -2
sglang/srt/mem_cache/base_prefix_cache.py +2 -2
sglang/srt/mem_cache/chunk_cache.py +13 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +16 -1
sglang/srt/mem_cache/hicache_storage.py +4 -1
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +435 -219
sglang/srt/mem_cache/memory_pool_host.py +0 -1
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +53 -19
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +92 -26
sglang/srt/metrics/collector.py +31 -0
sglang/srt/metrics/func_timer.py +1 -1
sglang/srt/model_executor/cuda_graph_runner.py +43 -5
sglang/srt/model_executor/forward_batch_info.py +28 -23
sglang/srt/model_executor/model_runner.py +379 -139
sglang/srt/model_executor/npu_graph_runner.py +2 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +424 -27
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +47 -28
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +13 -52
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +19 -3
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +273 -98
sglang/srt/models/dots_ocr.py +0 -2
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +13 -19
sglang/srt/models/gemma3_mm.py +16 -0
sglang/srt/models/gemma3n_mm.py +1 -2
sglang/srt/models/glm4_moe.py +14 -37
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +2 -1
sglang/srt/models/glm4v_moe.py +5 -5
sglang/srt/models/gpt_oss.py +5 -5
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +5 -22
sglang/srt/models/longcat_flash_nextn.py +3 -14
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +13 -3
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2_5_vl.py +3 -3
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +15 -12
sglang/srt/models/qwen2_vl.py +5 -2
sglang/srt/models/qwen3_moe.py +19 -35
sglang/srt/models/qwen3_next.py +7 -12
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +37 -33
sglang/srt/models/qwen3_vl_moe.py +57 -185
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +0 -1
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/utils.py +11 -1
sglang/srt/multimodal/processors/base_processor.py +6 -2
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +0 -1
sglang/srt/multimodal/processors/glm4v.py +1 -5
sglang/srt/multimodal/processors/internvl.py +0 -2
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +75 -16
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/reasoning_parser.py +0 -1
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +17 -22
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +577 -73
sglang/srt/server_args_config_parser.py +1 -1
sglang/srt/single_batch_overlap.py +38 -28
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
sglang/srt/speculative/eagle_info.py +57 -18
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +83 -280
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
sglang/srt/speculative/ngram_worker.py +12 -11
sglang/srt/speculative/spec_info.py +2 -0
sglang/srt/speculative/spec_utils.py +38 -3
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +28 -14
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +192 -47
sglang/srt/utils/hf_transformers_utils.py +40 -17
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +41 -0
sglang/test/runners.py +2 -0
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/test_block_fp8.py +1 -2
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +232 -99
sglang/test/test_deterministic_utils.py +73 -0
sglang/test/test_disaggregation_utils.py +81 -0
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_utils.py +85 -20
sglang/version.py +1 -1
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/test/runners.py CHANGED Viewed

@@ -519,6 +519,7 @@ class SRTRunner:
         lora_target_modules: Optional[List[str]] = None,
         enable_lora: Optional[bool] = None,
         max_loaded_loras: Optional[int] = None,
+        lora_eviction_policy: str = "lru",
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -565,6 +566,7 @@ class SRTRunner:
             lora_target_modules=lora_target_modules,
             enable_lora=enable_lora,
             max_loaded_loras=max_loaded_loras,
+            lora_eviction_policy=lora_eviction_policy,
             **spec_kwargs,
         )

sglang/test/send_one.py CHANGED Viewed

@@ -3,6 +3,8 @@ Run one test prompt.
 Usage:
 python3 -m sglang.test.send_one
+python3 -m sglang.test.send_one --profile --profile-steps 5
+python3 -m sglang.test.send_one --profile --profile-by-stage
 """
 import argparse
@@ -10,6 +12,9 @@ import dataclasses
 import json
 import requests
+import tabulate
+from sglang.profiler import run_profile
 @dataclasses.dataclass
@@ -29,6 +34,9 @@ class BenchArgs:
     image: bool = False
     many_images: bool = False
     stream: bool = False
+    profile: bool = False
+    profile_steps: int = 3
+    profile_by_stage: bool = False
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -51,6 +59,11 @@ class BenchArgs:
         parser.add_argument("--image", action="store_true")
         parser.add_argument("--many-images", action="store_true")
         parser.add_argument("--stream", action="store_true")
+        parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
+        parser.add_argument("--profile-by-stage", action="store_true")
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -59,6 +72,8 @@ class BenchArgs:
 def send_one_prompt(args):
+    base_url = f"http://{args.host}:{args.port}"
     if args.image:
         args.prompt = (
             "Human: Describe this image in a very short sentence.\n\nAssistant:"
@@ -108,19 +123,35 @@ def send_one_prompt(args):
         "stream": args.stream,
     }
+    # Run profiler if requested
+    if args.profile:
+        print(f"Running profiler with {args.profile_steps} steps...")
+        run_profile(
+            base_url,
+            args.profile_steps,
+            ["CPU", "GPU"],
+            None,
+            None,
+            args.profile_by_stage,
+        )
     response = requests.post(
-        f"http://{args.host}:{args.port}/generate",
+        f"{base_url}/generate",
         json=json_data,
         stream=args.stream,
     )
     if args.stream:
+        last_len = 0
         for chunk in response.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
                 if chunk == "data: [DONE]":
                     break
                 ret = json.loads(chunk[5:].strip("\n"))
+                chunk_str = ret["text"][last_len:]
+                last_len = len(ret["text"])
+                print(chunk_str, end="", flush=True)
     else:
         ret = response.json()
@@ -131,21 +162,25 @@ def send_one_prompt(args):
         print(ret)
         return 0, 0
-    latency = ret["meta_info"]["e2e_latency"]
-    if "spec_verify_ct" in ret["meta_info"]:
+    if "spec_verify_ct" in ret["meta_info"] and ret["meta_info"]["spec_verify_ct"] > 0:
         acc_length = (
             ret["meta_info"]["completion_tokens"] / ret["meta_info"]["spec_verify_ct"]
         )
     else:
         acc_length = 1.0
+    latency = ret["meta_info"]["e2e_latency"]
     speed = ret["meta_info"]["completion_tokens"] / latency
+    tokens = ret["meta_info"]["completion_tokens"]
+    if not args.stream:
+        print(ret["text"])
-    print(ret["text"])
     print()
-    print(f"{acc_length=:.2f}")
-    print(f"{speed=:.2f} token/s")
+    headers = ["Latency (s)", "Tokens", "Acc Length", "Speed (token/s)"]
+    rows = [[f"{latency:.3f}", f"{tokens}", f"{acc_length:.3f}", f"{speed:.2f}"]]
+    msg = tabulate.tabulate(rows, headers=headers, tablefmt="pretty")
+    print(msg)
     return acc_length, speed

sglang/test/simple_eval_common.py CHANGED Viewed

@@ -290,6 +290,9 @@ def aggregate_results(
     htmls = []
     convos = []
     for single_eval_result in single_eval_results:
+        # Skip None results
+        if single_eval_result is None:
+            continue
         for name, value in single_eval_result.metrics.items():
             name2values[name].append(value)
         if single_eval_result.score is not None:

sglang/test/simple_eval_gpqa.py CHANGED Viewed

@@ -18,7 +18,6 @@ from sglang.test.simple_eval_common import (
     HTML_JINJA,
     Eval,
     EvalResult,
-    MessageList,
     SamplerBase,
     SingleEvalResult,
     format_multichoice_question,

sglang/test/simple_eval_humaneval.py CHANGED Viewed

@@ -11,8 +11,6 @@ import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, List, Optional
-import tqdm
 try:
     from human_eval.data import read_problems
     from human_eval.evaluation import estimate_pass_at_k
@@ -41,7 +39,6 @@ def evaluate_functional_correctness(
     Evaluates the functional correctness of generated samples, and writes
     results to f"{sample_file}_results.jsonl.gz"
     """
-    import copy
     # Check the generated samples against test suites.
     with ThreadPoolExecutor(max_workers=n_workers) as executor:

sglang/test/simple_eval_longbench_v2.py ADDED Viewed

@@ -0,0 +1,344 @@
+# Adapted from https://github.com/openai/simple-evals/
+"""
+LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-Context Multitasks
+Yushi Bai, Shangqing Tu, Jiajie Zhang, Hao Peng, Xiaozhi Wang, Xin Lv, Shulin Cao, Jiazheng Xu, Lei Hou, Yuxiao Dong, Jie Tang, Juanzi Li
+https://arxiv.org/abs/2412.15204
+"""
+import csv
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional
+from transformers import AutoTokenizer
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    ANSWER_PATTERN_MULTICHOICE,
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    SamplerBase,
+    SingleEvalResult,
+)
+# LongBench-v2 task categories
+TASK_CATEGORIES = {
+    "single_document_qa",
+    "multi_document_qa",
+    "long_in_context_learning",
+    "long_dialogue_history",
+    "code_repo_understanding",
+    "long_structured_data",
+}
+DEFAULT_DATASET = "THUDM/LongBench-v2"
+DEFAULT_DATASET_SPLIT = "train"
+def format_longbench_v2_question(row: dict) -> str:
+    """Format a LongBench-v2 question using the official template."""
+    context = row.get("context", "")
+    question = row.get("question", "")
+    # Handle both standard format (A, B, C, D) and alternative format (choices list)
+    if "choices" in row:
+        choices = row["choices"]
+        choice_A = choices[0] if len(choices) > 0 else ""
+        choice_B = choices[1] if len(choices) > 1 else ""
+        choice_C = choices[2] if len(choices) > 2 else ""
+        choice_D = choices[3] if len(choices) > 3 else ""
+    else:
+        choice_A = row.get("A", row.get("choice_A", ""))
+        choice_B = row.get("B", row.get("choice_B", ""))
+        choice_C = row.get("C", row.get("choice_C", ""))
+        choice_D = row.get("D", row.get("choice_D", ""))
+    # Official LongBench-v2 template
+    prompt = f"""
+Please read the following text and answer the question below.
+<text>
+{context.strip()}
+</text>
+What is the correct answer to this question: {question.strip()}
+Choices:
+(A) {choice_A.strip()}
+(B) {choice_B.strip()}
+(C) {choice_C.strip()}
+(D) {choice_D.strip()}
+Format your response as follows: "The correct answer is (insert answer here)"."""
+    return prompt
+def extract_longbench_v2_answer(response: str) -> Optional[str]:
+    """Extract answer from model response using official LongBench-v2 method."""
+    response = response.replace("*", "")
+    # First try: "The correct answer is (A)"
+    match = re.search(r"The correct answer is \(([A-D])\)", response, re.IGNORECASE)
+    if match:
+        return match.group(1).upper()
+    # Second try: "The correct answer is A"
+    match = re.search(r"The correct answer is ([A-D])", response, re.IGNORECASE)
+    if match:
+        return match.group(1).upper()
+    # Fallback: Standard SGLang multichoice pattern
+    match = re.search(ANSWER_PATTERN_MULTICHOICE, response)
+    if match:
+        return match.group(1).upper()
+    # Generic fallback when model says "answer is A"
+    match = re.search(r"answer\s+is\s*\(?([A-D])\)?", response, re.IGNORECASE)
+    if match:
+        return match.group(1).upper()
+    return None
+class LongBenchV2Eval(Eval):
+    """
+    Evaluation utility for LongBench-v2 dataset.
+    LongBench-v2 is designed to assess the ability of LLMs to handle long-context problems
+    requiring deep understanding and reasoning across real-world multitasks.
+    """
+    def __init__(
+        self,
+        model: str = None,
+        data_source: str = DEFAULT_DATASET,
+        num_examples: Optional[int] = None,
+        num_threads: int = 1,
+        n_repeats: int = 1,
+        categories: Optional[List[str]] = None,
+        max_context_length: Optional[int] = None,
+        min_context_length: Optional[int] = None,
+    ):
+        """
+        Initialize LongBench-v2 evaluation.
+        Args:
+            data_source: HuggingFace dataset name, local file path (CSV/JSON)
+            num_examples: Number of examples to evaluate (None for all)
+            num_threads: Number of threads for parallel processing
+            n_repeats: Number of times to repeat evaluation for error bars
+            categories: List of task categories to include (None for all)
+            max_context_length: Maximum context length in characters
+            min_context_length: Minimum context length in characters
+        """
+        self.tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+        self.min_context_length = min_context_length
+        self.max_context_length = max_context_length
+        # Load dataset based on data source type
+        examples = self._load_dataset(data_source)
+        # Apply filtering
+        if categories:
+            examples = [ex for ex in examples if ex.get("category") in categories]
+        # Sample examples if specified
+        if num_examples:
+            assert n_repeats == 1, "n_repeats only supported when not sampling examples"
+            examples = examples[: min(num_examples, len(examples))]
+        # Repeat examples for multiple runs
+        examples = examples * n_repeats
+        if not examples:
+            raise ValueError(
+                "No examples available for LongBench-v2 evaluation after filtering"
+            )
+        self.examples = examples
+        self.n_repeats = n_repeats
+        self.num_threads = num_threads
+        print(f"Loaded {len(self.examples)} examples from LongBench-v2")
+        if categories:
+            print(f"Filtered to categories: {categories}")
+        if min_context_length or max_context_length:
+            print(
+                f"Context length filter: {min_context_length}-{max_context_length} characters"
+            )
+    def _load_dataset(self, data_source: str) -> List[Dict[str, Any]]:
+        """Load dataset from HuggingFace hub or local files."""
+        if not data_source:
+            data_source = DEFAULT_DATASET
+        if os.path.exists(data_source):
+            raw_examples = self._load_local_file(data_source)
+        else:
+            raw_examples = self._load_hf_dataset(data_source)
+        return [self._normalize_example(example) for example in raw_examples]
+    def _load_local_file(self, path: str) -> List[Dict[str, Any]]:
+        """Load examples from a local CSV/JSON/JSONL file."""
+        suffix = os.path.splitext(path)[1].lower()
+        if suffix in {".json", ".jsonl"}:
+            with open(path, "r", encoding="utf-8") as fh:
+                if suffix == ".jsonl":
+                    data = [json.loads(line) for line in fh if line.strip()]
+                else:
+                    data = json.load(fh)
+        elif suffix == ".csv":
+            with open(path, "r", encoding="utf-8") as fh:
+                reader = csv.DictReader(fh)
+                data = list(reader)
+        else:
+            # Try JSON, then CSV as fallback
+            try:
+                with open(path, "r", encoding="utf-8") as fh:
+                    data = json.load(fh)
+            except json.JSONDecodeError:
+                with open(path, "r", encoding="utf-8") as fh:
+                    reader = csv.DictReader(fh)
+                    data = list(reader)
+        if isinstance(data, dict):
+            data = data.get("data", [])
+        if not isinstance(data, list):
+            raise ValueError("Expected list of examples from local file")
+        return data
+    def _load_hf_dataset(self, identifier: str) -> List[Dict[str, Any]]:
+        """Load the dataset from HuggingFace Hub."""
+        parts = identifier.split(":", maxsplit=1)
+        dataset_name = parts[0]
+        split = parts[1] if len(parts) == 2 else DEFAULT_DATASET_SPLIT
+        try:
+            from datasets import load_dataset  # type: ignore
+        except ImportError as exc:
+            raise ImportError(
+                "Please install the 'datasets' package to load LongBench-v2 from HuggingFace: pip install datasets"
+            ) from exc
+        dataset = load_dataset(dataset_name, split=split)
+        return [dict(row) for row in dataset]
+    def _normalize_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure each example exposes the expected keys."""
+        normalized = dict(example)
+        for letter in ["A", "B", "C", "D"]:
+            choice_key = f"choice_{letter}"
+            if letter not in normalized and choice_key in normalized:
+                normalized[letter] = normalized[choice_key]
+        if "category" not in normalized and "domain" in normalized:
+            normalized["category"] = normalized["domain"]
+        answer = normalized.get("answer")
+        if isinstance(answer, str):
+            normalized["answer"] = answer.strip().upper()
+        elif isinstance(answer, int) and 0 <= answer < 4:
+            normalized["answer"] = ["A", "B", "C", "D"][answer]
+        return normalized
+    def _check_context_length(
+        self,
+        formatted_question: str,
+        tokenizer: AutoTokenizer,
+        min_length: Optional[int],
+        max_length: Optional[int],
+    ) -> bool:
+        """Filter examples by context length measured in characters."""
+        input_ids = tokenizer.encode(formatted_question)
+        context_length = len(input_ids)
+        if min_length is not None and context_length < min_length:
+            return False
+        if max_length is not None and context_length > max_length:
+            return False
+        return True
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        """Run the evaluation."""
+        def fn(row: dict):
+            # Format the question using official template
+            formatted_question = format_longbench_v2_question(row)
+            if self.min_context_length or self.max_context_length:
+                if not self._check_context_length(
+                    formatted_question,
+                    self.tokenizer,
+                    self.min_context_length,
+                    self.max_context_length,
+                ):
+                    # Skip this example
+                    return None
+            prompt_messages = [
+                sampler._pack_message(content=formatted_question, role="user")
+            ]
+            # Get model response
+            response_text = sampler(prompt_messages)
+            if response_text is None:
+                response_text = ""
+            # Extract answer using official method
+            extracted_answer = extract_longbench_v2_answer(response_text)
+            # Get correct answer
+            correct_answer = row.get("answer", "")
+            if isinstance(correct_answer, str):
+                correct_answer = correct_answer.strip().upper()
+            elif isinstance(correct_answer, int) and 0 <= correct_answer < 4:
+                correct_answer = ["A", "B", "C", "D"][correct_answer]
+            # Calculate score
+            score = 1.0 if extracted_answer == correct_answer else 0.0
+            # Generate HTML report
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=correct_answer,
+                extracted_answer=extracted_answer,
+            )
+            # Build conversation
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            # Prepare metrics
+            metrics = {"chars": len(response_text)}
+            # Add category-specific metrics
+            category = row.get("category", row.get("domain", "unknown"))
+            if category in TASK_CATEGORIES:
+                metrics[category] = score
+            difficulty = row.get("difficulty")
+            if isinstance(difficulty, str) and difficulty:
+                metrics[f"difficulty_{difficulty.lower()}"] = score
+            return SingleEvalResult(
+                html=html,
+                score=score,
+                convo=convo,
+                metrics=metrics,
+            )
+        # Run evaluation with progress tracking
+        results = common.map_with_progress(fn, self.examples, self.num_threads)
+        return common.aggregate_results(results)

sglang/test/test_block_fp8.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import itertools
-import os
 import unittest
 import torch
@@ -577,7 +576,7 @@ class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase):
         if not torch.cuda.is_available():
             raise unittest.SkipTest("CUDA is not available")
         try:
-            import deep_gemm
+            import deep_gemm  # noqa: F401
         except ImportError:
             raise unittest.SkipTest("DeepGEMM is not available")
         torch.set_default_device("cuda")

sglang/test/test_block_fp8_deep_gemm_blackwell.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import itertools
-import os
 import unittest
 from typing import List, Tuple

sglang/test/test_cutlass_moe.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import argparse
-import time
 import torch
 import triton  # Added import
@@ -34,7 +33,7 @@ def get_model_config(tp_size: int):
         "topk": topk,
         "hidden_size": config.hidden_size,
         "shard_intermediate_size": shard_intermediate_size,
-        "dtype": config.torch_dtype,
+        "dtype": config.dtype,
         "block_shape": config.quantization_config["weight_block_size"],
     }

sglang/test/test_cutlass_w4a8_moe.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Literal, Optional
+from typing import Optional
 import pytest
 import torch
@@ -120,7 +120,7 @@ def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dty
     )
     topk_weights, topk_ids, _ = topk_output
     expert_map = torch.arange(E, dtype=torch.int32, device=device)
-    expert_map[local_e:] = E
+    expert_map[local_e:] = -1
     output = cutlass_moe(
         a,
@@ -138,9 +138,7 @@ def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dty
         c_strides2,
         s_strides13,
         s_strides2,
-        0,
-        local_e - 1,
-        E,
+        local_e,
         a1_scale,
         a2_scale,
         expert_map,
@@ -178,7 +176,7 @@ def cutlass_moe(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
     topk_weights: torch.Tensor,
-    topk_ids_: torch.Tensor,
+    topk_ids: torch.Tensor,
     a_strides1: torch.Tensor,
     b_strides1: torch.Tensor,
     c_strides1: torch.Tensor,
@@ -187,40 +185,32 @@ def cutlass_moe(
     c_strides2: torch.Tensor,
     s_strides13: torch.Tensor,
     s_strides2: torch.Tensor,
-    start_expert_id: int,
-    end_expert_id: int,
-    E: int,
+    num_local_experts: int,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
     expert_map: Optional[torch.Tensor] = None,
     apply_router_weight_on_input: bool = False,
 ):
-    local_topk_ids = topk_ids_
-    local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
+    topk_ids = expert_map[topk_ids]
     device = a.device
-    local_num_experts = end_expert_id - start_expert_id + 1
     expert_offsets = torch.empty(
-        (local_num_experts + 1), dtype=torch.int32, device=device
+        (num_local_experts + 1), dtype=torch.int32, device=device
     )
     problem_sizes1 = torch.empty(
-        (local_num_experts, 3), dtype=torch.int32, device=device
+        (num_local_experts, 3), dtype=torch.int32, device=device
     )
     problem_sizes2 = torch.empty(
-        (local_num_experts, 3), dtype=torch.int32, device=device
+        (num_local_experts, 3), dtype=torch.int32, device=device
     )
     return cutlass_w4a8_moe(
-        start_expert_id,
-        end_expert_id,
-        E,
         a,
         w1_q,
         w2_q,
         w1_scale,
         w2_scale,
         topk_weights,
-        topk_ids_,
-        local_topk_ids,
+        topk_ids,
         a_strides1,
         b_strides1,
         c_strides1,

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl