PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

sglang: 0.5.3rc0 (py3-none-any.whl) → 0.5.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482)

sglang/bench_one_batch.py +54 -37
sglang/bench_one_batch_server.py +340 -34
sglang/bench_serving.py +340 -159
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +9 -2
sglang/profiler.py +20 -3
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +309 -0
sglang/srt/configs/load_config.py +33 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +284 -118
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +576 -0
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +6 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +26 -15
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +268 -98
sglang/srt/disaggregation/decode.py +172 -39
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +203 -555
sglang/srt/disaggregation/nixl/conn.py +217 -63
sglang/srt/disaggregation/prefill.py +113 -270
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +203 -97
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +85 -65
sglang/srt/entrypoints/grpc_server.py +632 -305
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +169 -17
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +327 -34
sglang/srt/entrypoints/openai/serving_base.py +74 -8
sglang/srt/entrypoints/openai/serving_chat.py +202 -118
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +20 -4
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +47 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +323 -0
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +21 -16
sglang/srt/function_call/glm4_moe_detector.py +4 -8
sglang/srt/function_call/gpt_oss_detector.py +24 -1
sglang/srt/function_call/json_array_parser.py +61 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +98 -7
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/grpc_request_manager.py +915 -0
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
sglang/srt/layers/activation.py +11 -7
sglang/srt/layers/attention/aiter_backend.py +17 -18
sglang/srt/layers/attention/ascend_backend.py +125 -10
sglang/srt/layers/attention/attention_registry.py +226 -0
sglang/srt/layers/attention/base_attn_backend.py +32 -4
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +52 -15
sglang/srt/layers/attention/flashinfer_backend.py +357 -212
sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
sglang/srt/layers/attention/flashmla_backend.py +9 -7
sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
sglang/srt/layers/attention/mamba/mamba.py +514 -1
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +23 -0
sglang/srt/layers/attention/nsa_backend.py +1201 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +249 -42
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
sglang/srt/layers/attention/utils.py +11 -7
sglang/srt/layers/attention/vision.py +61 -3
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +19 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +28 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +47 -15
sglang/srt/layers/linear.py +30 -5
sglang/srt/layers/logits_processor.py +161 -18
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
sglang/srt/layers/moe/ep_moe/layer.py +243 -448
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +3 -2
sglang/srt/layers/moe/utils.py +27 -1
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/__init__.py +2 -53
sglang/srt/layers/quantization/awq.py +183 -6
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +20 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +86 -20
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +43 -15
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +141 -81
sglang/srt/layers/quantization/mxfp4.py +17 -34
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +1 -4
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +51 -24
sglang/srt/layers/quantization/w8a8_int8.py +45 -27
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +750 -46
sglang/srt/layers/sampler.py +84 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +23 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +9 -4
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +33 -7
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +41 -17
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +83 -152
sglang/srt/managers/data_parallel_controller.py +156 -87
sglang/srt/managers/detokenizer_manager.py +51 -24
sglang/srt/managers/io_struct.py +223 -129
sglang/srt/managers/mm_utils.py +49 -10
sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +130 -0
sglang/srt/managers/schedule_batch.py +340 -529
sglang/srt/managers/schedule_policy.py +158 -18
sglang/srt/managers/scheduler.py +665 -620
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
sglang/srt/managers/tokenizer_manager.py +462 -226
sglang/srt/managers/tp_worker.py +217 -156
sglang/srt/managers/utils.py +79 -47
sglang/srt/mem_cache/allocator.py +21 -22
sglang/srt/mem_cache/allocator_ascend.py +42 -28
sglang/srt/mem_cache/base_prefix_cache.py +3 -3
sglang/srt/mem_cache/chunk_cache.py +20 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +38 -0
sglang/srt/mem_cache/hicache_storage.py +44 -2
sglang/srt/mem_cache/hiradix_cache.py +134 -34
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +602 -208
sglang/srt/mem_cache/memory_pool_host.py +134 -183
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +263 -78
sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +115 -58
sglang/srt/metrics/collector.py +113 -120
sglang/srt/metrics/func_timer.py +3 -8
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +81 -36
sglang/srt/model_executor/forward_batch_info.py +40 -50
sglang/srt/model_executor/model_runner.py +507 -319
sglang/srt/model_executor/npu_graph_runner.py +11 -5
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +438 -37
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +200 -27
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +40 -56
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +25 -4
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +793 -235
sglang/srt/models/dots_ocr.py +171 -0
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +570 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +17 -1
sglang/srt/models/gemma3n_mm.py +2 -3
sglang/srt/models/glm4_moe.py +17 -40
sglang/srt/models/glm4_moe_nextn.py +4 -4
sglang/srt/models/glm4v.py +3 -2
sglang/srt/models/glm4v_moe.py +6 -6
sglang/srt/models/gpt_oss.py +12 -35
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +4 -2
sglang/srt/models/llama.py +6 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +6 -23
sglang/srt/models/longcat_flash_nextn.py +4 -15
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +27 -6
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +5 -5
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +70 -4
sglang/srt/models/qwen2_vl.py +6 -3
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +50 -38
sglang/srt/models/qwen3_next.py +43 -21
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +791 -0
sglang/srt/models/qwen3_vl_moe.py +343 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +268 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +61 -0
sglang/srt/multimodal/processors/base_processor.py +21 -9
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +2 -4
sglang/srt/multimodal/processors/glm4v.py +1 -5
sglang/srt/multimodal/processors/internvl.py +20 -10
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +83 -17
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/parser/reasoning_parser.py +0 -1
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +36 -23
sglang/srt/sampling/sampling_params.py +75 -0
sglang/srt/server_args.py +1300 -338
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +161 -0
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
sglang/srt/speculative/eagle_info.py +786 -0
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +113 -1270
sglang/srt/speculative/eagle_worker.py +120 -285
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/ngram_info.py +433 -0
sglang/srt/speculative/ngram_worker.py +246 -0
sglang/srt/speculative/spec_info.py +49 -0
sglang/srt/speculative/spec_utils.py +641 -0
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +35 -18
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/{utils.py → utils/common.py} +583 -113
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/profile_merger.py +199 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/get_logits_ut.py +57 -0
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +120 -11
sglang/test/runners.py +3 -1
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +8 -2
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +3 -4
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +430 -0
sglang/test/test_deterministic_utils.py +73 -0
sglang/test/test_disaggregation_utils.py +93 -1
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +432 -16
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
sglang/srt/entrypoints/grpc_request_manager.py +0 -580
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/test/test_deterministic.py (added)

@@ -0,0 +1,430 @@
+"""
+Batch the same prompt in varying batch sizes and test whether the results are consistent across different trials.
+Usage:
+# Single mode: test determinism with varying batch sizes
+python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode single
+# Prefix mode: test with shared prefixes
+python3 -m sglang.test.test_deterministic --n-start 1 --n-trials 50 --test-mode prefix
+# Radix Cache Consistency mode: test radix cache determinism (cached vs uncached prefill)
+python3 -m sglang.test.test_deterministic --test-mode radix_cache
+"""
+import argparse
+import dataclasses
+import json
+import os
+import random
+from typing import List
+import requests
+from sglang.profiler import run_profile
+# Text prompt that send_single repeats batch_size times when no input_ids are given.
+PROMPT_1 = "Tell me about Richard Feynman: "
+# Alternative prompt constant.
+PROMPT_2 = "Generate 1000 random numbers. Go directly into it, don't say Sure and don't say here are numbers. Just start with a number."
+# long_prompt.txt is looked up in the directory containing this module and is
+# read once, at import time.
+dirpath = os.path.dirname(__file__)
+with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
+    # Prefix mode slices this text to build prompts with different prefix lengths.
+    LONG_PROMPT = f.read()
+@dataclasses.dataclass
+class BenchArgs:
+    """Options for the determinism test client.
+
+    Every field value is also used as the default of the matching
+    command-line flag registered by ``add_cli_args``. ``batch_size`` is the
+    one field without a flag.
+    """
+    # Server address; combined into the base URL of every request.
+    host: str = "localhost"
+    port: int = 30000
+    # No CLI flag is registered for this field.
+    batch_size: int = 1
+    # Values forwarded in the "sampling_params" field of each request.
+    temperature: float = 0.0
+    sampling_seed: int = 42
+    max_new_tokens: int = 100
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    # Values forwarded as top-level fields of each request.
+    return_logprob: bool = False
+    stream: bool = False
+    # Profiling options handed to run_profile.
+    profile: bool = False
+    profile_steps: int = 3
+    profile_by_stage: bool = False
+    # One of "single", "prefix" or "radix_cache".
+    test_mode: str = "single"
+    # Number of trials, and the batch size of the first trial in prefix mode.
+    n_trials: int = 50
+    n_start: int = 1
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        """Register the command-line flags of this tool on ``parser``.
+
+        Defaults are taken from the dataclass field defaults. Boolean options
+        are ``store_true`` switches and therefore default to ``False``.
+        """
+        parser.add_argument("--host", type=str, default=BenchArgs.host)
+        parser.add_argument("--port", type=int, default=BenchArgs.port)
+        parser.add_argument("--n-trials", type=int, default=BenchArgs.n_trials)
+        parser.add_argument("--n-start", type=int, default=BenchArgs.n_start)
+        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument(
+            "--sampling-seed", type=int, default=BenchArgs.sampling_seed
+        )
+        parser.add_argument(
+            "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
+        )
+        parser.add_argument(
+            "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
+        )
+        parser.add_argument(
+            "--presence-penalty", type=float, default=BenchArgs.presence_penalty
+        )
+        parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument("--stream", action="store_true")
+        parser.add_argument(
+            "--test-mode",
+            type=str,
+            default=BenchArgs.test_mode,
+            choices=[
+                "single",
+                "prefix",
+                "radix_cache",
+            ],
+        )
+        parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
+        parser.add_argument("--profile-by-stage", action="store_true")
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        """Build an instance from a parsed argument namespace.
+
+        A field that has no attribute on ``args`` keeps its dataclass default.
+        This matters for ``batch_size``, which has no CLI flag: a plain
+        ``getattr(args, "batch_size")`` raises ``AttributeError`` for every
+        namespace produced by ``add_cli_args``.
+        """
+        return cls(
+            **{
+                field.name: getattr(args, field.name, field.default)
+                for field in dataclasses.fields(cls)
+            }
+        )
+def send_single(
+    args,
+    batch_size: int = 1,
+    profile: bool = False,
+    profile_steps: int = 3,
+    profile_by_stage: bool = False,
+    return_full_response: bool = False,
+    input_ids: List[int] = None,
+    max_new_tokens: int = None,
+):
+    """Send one ``/generate`` request and return its first result.
+
+    Args:
+        args: Object exposing ``host``, ``port``, ``temperature``,
+            ``sampling_seed``, ``max_new_tokens``, ``frequency_penalty``,
+            ``presence_penalty``, ``return_logprob`` and ``stream``.
+        batch_size: Number of copies of ``PROMPT_1`` to send. Ignored when
+            ``input_ids`` is given.
+        profile: When true, ``run_profile`` is called before the request.
+        profile_steps: Passed through to ``run_profile``.
+        profile_by_stage: Passed through to ``run_profile``.
+        return_full_response: Return the whole result object instead of only
+            its ``"text"`` entry.
+        input_ids: Token ids sent in place of the text prompt.
+        max_new_tokens: Overrides ``args.max_new_tokens`` when not ``None``.
+
+    Returns:
+        The ``"text"`` entry of the first result (or the full first result
+        when ``return_full_response`` is true). ``None`` if the server does
+        not answer with status 200 or a streamed response carries no data
+        event.
+    """
+    base_url = f"http://{args.host}:{args.port}"
+    # Use input_ids if provided, otherwise use text prompts
+    if input_ids is not None:
+        prompt_field = {"input_ids": input_ids}
+    else:
+        prompt_field = {"text": [PROMPT_1] * batch_size}
+    sampling_params = {
+        "temperature": args.temperature,
+        "max_new_tokens": (
+            max_new_tokens if max_new_tokens is not None else args.max_new_tokens
+        ),
+        "frequency_penalty": args.frequency_penalty,
+        "presence_penalty": args.presence_penalty,
+    }
+    if args.sampling_seed is not None:
+        # sglang server cannot parse None value for sampling_seed
+        sampling_params["sampling_seed"] = args.sampling_seed
+    json_data = {
+        **prompt_field,
+        "sampling_params": sampling_params,
+        "return_logprob": args.return_logprob,
+        "stream": args.stream,
+    }
+    if profile:
+        run_profile(
+            base_url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
+        )
+    response = requests.post(
+        f"{base_url}/generate",
+        json=json_data,
+        stream=args.stream,
+    )
+    if response.status_code != 200:
+        try:
+            ret = response.json()
+        except ValueError:
+            # Error bodies are not guaranteed to be JSON; report the raw text.
+            ret = response.text
+        print(f"Error: {ret}")
+        return None
+    if args.stream:
+        ret = None
+        for chunk in response.iter_lines(decode_unicode=False):
+            chunk = chunk.decode("utf-8")
+            if chunk and chunk.startswith("data:"):
+                if chunk == "data: [DONE]":
+                    break
+                # Only the most recent event is kept; earlier ones are overwritten.
+                ret = json.loads(chunk[5:].strip("\n"))
+        if ret is None:
+            # Without this guard the code below would hit an unbound ``ret``.
+            print("Error: stream ended without any data event")
+            return None
+    else:
+        ret = response.json()
+    ret = ret[0] if isinstance(ret, list) else ret
+    if return_full_response:
+        return ret
+    return ret["text"]
+def send_prefix(args, batch_size: int, prompts: List[str]):
+    """Flush the server cache, then generate for a randomly sampled batch.
+
+    ``batch_size`` prompts are drawn (with replacement) from ``prompts`` and
+    sent in a single ``/generate`` request.
+
+    Returns:
+        A dict mapping every index of ``prompts`` to the list of generated
+        texts for the requests that used that prompt (empty if it was never
+        drawn). On a non-200 status the response body is printed and the
+        tuple ``(-1, -1, -1)`` is returned instead, so callers have to tell
+        the two result shapes apart.
+    """
+    server_url = f"http://{args.host}:{args.port}"
+    requests.post(f"{server_url}/flush_cache")
+    # Draw one prompt index per request in the batch.
+    picks = [random.randint(0, len(prompts) - 1) for _ in range(batch_size)]
+    sampling_params = {
+        "temperature": args.temperature,
+        "max_new_tokens": args.max_new_tokens,
+        "frequency_penalty": args.frequency_penalty,
+        "presence_penalty": args.presence_penalty,
+    }
+    if args.sampling_seed is not None:
+        sampling_params["sampling_seed"] = args.sampling_seed
+    payload = {
+        "text": [prompts[pick] for pick in picks],
+        "sampling_params": sampling_params,
+        "return_logprob": args.return_logprob,
+        "stream": args.stream,
+    }
+    response = requests.post(
+        f"{server_url}/generate",
+        json=payload,
+        stream=args.stream,
+    )
+    ret = response.json()
+    if response.status_code != 200:
+        print(ret)
+        return -1, -1, -1
+    # Group the generated texts by the prompt index that produced them.
+    ret_dict = {prompt_idx: [] for prompt_idx in range(len(prompts))}
+    for position, prompt_idx in enumerate(picks):
+        ret_dict[prompt_idx].append(ret[position]["text"])
+    return ret_dict
+def test_deterministic(args):
+    if args.test_mode == "single":
+        # In single mode, we test the deterministic behavior by sending the same prompt in batch sizes ranging from 1 to n_trials.
+        texts = []
+        for i in range(1, args.n_trials + 1):
+            batch_size = i
+            text = send_single(args, batch_size, args.profile)
+            text = text.replace("\n", " ")
+            print(f"Trial {i} with batch size {batch_size}: {text}")
+            texts.append(text)
+        print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
+        return [len(set(texts))]
+    elif args.test_mode == "prefix":
+        # In prefix mode, we create prompts from the same long prompt, with different lengths of common prefix.
+        len_prefix = [1, 511, 2048, 4097]
+        num_prompts = len(len_prefix)
+        outputs = {i: [] for i in range(4)}
+        prompts = [LONG_PROMPT[: len_prefix[i]] for i in range(4)]
+        for i in range(args.n_start, args.n_start + args.n_trials):
+            batch_size = i
+            ret_dict = send_prefix(args, batch_size, prompts)
+            msg = f"Testing Trial {i} with batch size {batch_size},"
+            for i in range(num_prompts):
+                msg += f" # prefix length {len_prefix[i]}: {len(ret_dict[i])},"
+            print(msg)
+            for i in range(num_prompts):
+                outputs[i].extend(ret_dict[i])
+        for i in range(num_prompts):
+            print(
+                f"Prompt {i} with prefix length {len_prefix[i]}: total samples: {len(outputs[i])}, Unique samples: {len(set(outputs[i]))}"
+            )
+        results = []
+        for i in range(num_prompts):
+            results.append(len(set(outputs[i])))
+        return results
+    elif args.test_mode == "radix_cache":
+        # Radix mode requires logprobs to compare results
+        args.return_logprob = True
+        print("\n=== Prefill Cache Consistency Test ===")
+        print(
+            "This test verifies prefill request produces consistent logprobs w/ and w/o cache.\n"
+        )
+        # We noticed that we cannot call flush cache before any request, otherwise it will hang.
+        warmup_response = send_single(
+            args, input_ids=[1] * 64, max_new_tokens=65, return_full_response=True
+        )
+        # Flush cache first to make sure there is no cache hit from previous tests
+        flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
+        print(f"Step 1: Generating random 64 token IDs...")
+        # Use a reasonable token ID range (e.g., 1-50000 for most tokenizers)
+        # Avoid special tokens like 0 (padding), 1 (BOS), 2 (EOS)
+        # set seed for random.randint
+        random.seed(42)
+        initial_token_ids = [random.randint(100, 50000) for _ in range(64)]
+        print(f"✓ Using {len(initial_token_ids)} initial tokens")
+        print(f"  Initial token IDs: {initial_token_ids}")
+        print(
+            f"\nStep 2: Generating 2 tokens from {len(initial_token_ids)} token prefix..."
+        )
+        first_response = send_single(
+            args,
+            input_ids=initial_token_ids,
+            max_new_tokens=100,
+            return_full_response=True,
+        )
+        first_output_text = first_response["text"]
+        first_output_token_ids = first_response["output_ids"]
+        first_output_logprobs = first_response["meta_info"]["output_token_logprobs"]
+        expected_token_id = first_output_token_ids[-1]
+        expected_logprob = first_output_logprobs[-1][0]
+        print(f"✓ Generated {len(first_output_token_ids)} tokens")
+        print(f'  Output text: "{first_output_text}"')
+        print(
+            f"\nStep 3: Generating with radix cache (164 tokens prefill, should hit > 128 tokens cache, based on page size)..."
+        )
+        prefix_token_ids = initial_token_ids + first_output_token_ids[:-1]
+        print(
+            f"  Prefix: {len(initial_token_ids)} initial + 64 generated = {len(prefix_token_ids)} tokens"
+        )
+        print(f"Using Prompt: {prefix_token_ids}")
+        cached_response = send_single(
+            args,
+            input_ids=prefix_token_ids,
+            max_new_tokens=1,
+            return_full_response=True,
+        )
+        cached_logprobs = cached_response["meta_info"]["output_token_logprobs"]
+        cached_token_data = cached_logprobs[0]
+        cached_logprob = cached_token_data[0]
+        cached_token_id = cached_token_data[1]
+        print(f"✓ Generated with cache:")
+        print(f"  Token ID: {cached_token_id}")
+        print(f"  Logprob:  {cached_logprob:.10f}")
+        print(f"\nStep 4: Flushing cache...")
+        flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
+        print(
+            f"\nStep 5: Generating without cache (same 164 tokens prefill, no cache)..."
+        )
+        print(f"Using Prompt: {prefix_token_ids}")
+        uncached_response = send_single(
+            args,
+            input_ids=prefix_token_ids,
+            max_new_tokens=1,
+            return_full_response=True,
+        )
+        uncached_logprobs = uncached_response["meta_info"]["output_token_logprobs"]
+        uncached_token_data = uncached_logprobs[0]
+        uncached_logprob = uncached_token_data[0]
+        uncached_token_id = uncached_token_data[1]
+        print(f"✓ Generated without cache:")
+        print(f"  Token ID: {uncached_token_id}")
+        print(f"  Logprob:  {uncached_logprob:.10f}")
+        # Step 6: Compare results
+        print(f"\n{'='*60}")
+        print("Comparison 1: Decode (Request 1) vs Prefill with Cache (Request 2)")
+        print("=" * 60)
+        # Compare first request (decode) vs second request (prefill with cache)
+        # We expect them to be different (different kernels)
+        decode_vs_prefill_token_match = expected_token_id == cached_token_id
+        decode_vs_prefill_logprob_match = expected_logprob == cached_logprob
+        print(
+            f"  Decode token (Request 1):          ID={expected_token_id}, logprob={expected_logprob:.10f}"
+        )
+        print(
+            f"  Prefill w/ cache token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}"
+        )
+        print(
+            f"  Token ID match: {'✓ YES' if decode_vs_prefill_token_match else '✗ NO'}"
+        )
+        print(
+            f"  Logprob match:  {'✓ YES' if decode_vs_prefill_logprob_match else '✗ NO'}"
+        )
+        if not decode_vs_prefill_logprob_match:
+            diff = abs(expected_logprob - cached_logprob)
+            print(f"  Logprob difference: {diff:.10e}")
+        print(f"  Note: We expect these to be DIFFERENT (decode vs prefill kernels)")
+        print(f"\n{'='*60}")
+        print(
+            "Comparison 2: Cached Prefill (Request 2) vs Uncached Prefill (Request 3)"
+        )
+        print("=" * 60)
+        # Main test: compare cached vs uncached prefill (should be identical)
+        token_match = cached_token_id == uncached_token_id
+        logprob_match = cached_logprob == uncached_logprob
+        print(
+            f"  Cached prefill token (Request 2):   ID={cached_token_id}, logprob={cached_logprob:.10f}"
+        )
+        print(
+            f"  Uncached prefill token (Request 3): ID={uncached_token_id}, logprob={uncached_logprob:.10f}"
+        )
+        print(f"  Token ID match: {'✓ YES' if token_match else '✗ NO'}")
+        if not token_match:
+            print(f"    Cached:   {cached_token_id}")
+            print(f"    Uncached: {uncached_token_id}")
+        print(f"  Logprob match:  {'✓ YES' if logprob_match else '✗ NO'}")
+        if not logprob_match:
+            print(f"    Cached:   {cached_logprob:.10f}")
+            print(f"    Uncached: {uncached_logprob:.10f}")
+            diff = abs(cached_logprob - uncached_logprob)
+            print(f"    Difference: {diff:.10e}")
+        print(f"  Note: We expect these to be IDENTICAL (both prefill kernels)")
+        print(f"\n{'='*60}")
+        if token_match and logprob_match:
+            print("✓✓✓ TEST PASSED - Radix cache is consistent! ✓✓✓")
+            return [1]
+        else:
+            print("✗✗✗ TEST FAILED - Radix cache produces different results! ✗✗✗")
+            return [0]
+    else:
+        raise ValueError(f"Invalid test mode: {args.test_mode}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    if args.sampling_seed is None:
+        args.sampling_seed = 42
+    test_deterministic(args)

sglang/test/test_deterministic_utils.py ADDED Viewed

@@ -0,0 +1,73 @@
+import unittest
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_deterministic import BenchArgs, test_deterministic
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+# Model path returned by TestDeterministicBase.get_model().
+DEFAULT_MODEL = "Qwen/Qwen3-8B"
+# Server launch flags returned by TestDeterministicBase.get_server_args().
+COMMON_SERVER_ARGS = [
+    "--trust-remote-code",
+    "--cuda-graph-max-bs",
+    "32",
+    "--enable-deterministic-inference",
+]
+class TestDeterministicBase(CustomTestCase):
+    @classmethod
+    def get_server_args(cls):
+        return COMMON_SERVER_ARGS
+    @classmethod
+    def get_model(cls):
+        return DEFAULT_MODEL
+    @classmethod
+    def setUpClass(cls):
+        cls.model = cls.get_model()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        if "--attention-backend" not in cls.get_server_args():
+            raise unittest.SkipTest("Skip the base test class")
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=cls.get_server_args(),
+        )
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+    def _extract_host_and_port(self, url):
+        return url.split("://")[-1].split(":")[0], int(url.split(":")[-1])
+    def test_single(self):
+        args = BenchArgs()
+        url = DEFAULT_URL_FOR_TEST
+        args.host, args.port = self._extract_host_and_port(url)
+        args.test_mode = "single"
+        args.n_start = 10
+        args.n_trials = 20
+        results = test_deterministic(args)
+        args.temperature = 0.5  # test for deterministic sampling
+        for result in results:
+            assert result == 1
+    def test_prefix(self):
+        args = BenchArgs()
+        url = DEFAULT_URL_FOR_TEST
+        args.host, args.port = self._extract_host_and_port(url)
+        args.test_mode = "prefix"
+        args.n_start = 10
+        args.n_trials = 10
+        args.temperature = 0.5  # test for deterministic sampling
+        results = test_deterministic(args)
+        for result in results:
+            assert result == 1

sglang/test/test_disaggregation_utils.py CHANGED Viewed

@@ -1,20 +1,56 @@
+import logging
+import os
 import time
+import warnings
+from urllib.parse import urlparse
 import requests
+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
     CustomTestCase,
+    is_in_ci,
     popen_with_error_check,
 )
+logger = logging.getLogger(__name__)
 class TestDisaggregationBase(CustomTestCase):
     @classmethod
     def setUpClass(cls):
+        """Derive the test endpoints and the disaggregation transfer settings.
+        The load balancer uses the port of DEFAULT_URL_FOR_TEST; the prefill and
+        decode servers use that port plus 100 and plus 200 on the same host.
+        """
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
         cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
-        pass
+        # config transfer backend and rdma devices
+        # In CI the backend is fixed to mooncake and devices come from get_rdma_devices_args().
+        if is_in_ci():
+            cls.transfer_backend = ["--disaggregation-transfer-backend", "mooncake"]
+            cls.rdma_devices = ["--disaggregation-ib-device", get_rdma_devices_args()]
+        else:
+            # Outside CI both values are read from the SGLANG_TEST_PD_DISAGG_* settings.
+            cls.transfer_backend = [
+                "--disaggregation-transfer-backend",
+                envs.SGLANG_TEST_PD_DISAGG_BACKEND.get(),
+            ]
+            cls.rdma_devices = [
+                "--disaggregation-ib-device",
+                envs.SGLANG_TEST_PD_DISAGG_DEVICES.get(),
+            ]
+            # No device configured: drop the flag entirely instead of passing None.
+            if cls.rdma_devices[1] is None:
+                cls.rdma_devices = []
+                msg = "No RDMA devices specified for disaggregation test, using default settings."
+                warnings.warn(msg)
     @classmethod
     def launch_lb(cls):
@@ -64,3 +100,59 @@ class TestDisaggregationBase(CustomTestCase):
         # wait for 5 seconds
         time.sleep(5)
+def get_rdma_devices_args():
+    def _parse_list_env(var_name: str):
+        val = os.getenv(var_name)
+        if not val:
+            return None
+        items = [x.strip() for x in val.split(",") if x.strip()]
+        return items or None
+    def _pick_default_pair(rdma_all_devices):
+        return [rdma_all_devices[0], rdma_all_devices[len(rdma_all_devices) // 2]]
+    rdma_all_devices = _parse_list_env("SGLANG_CI_RDMA_ALL_DEVICES") or [
+        f"mlx5_roce{i}" for i in range(8)
+    ]
+    logger.info("Resolved rdma_all_devices=%s", rdma_all_devices)
+    n_rdma = len(rdma_all_devices)
+    # 1. Get visible GPU indices
+    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
+    if not cuda_visible_devices:
+        warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
+        return ",".join(_pick_default_pair(rdma_all_devices))
+    try:
+        # Convert to list of integers (handling possible spaces and empty strings)
+        gpu_indices = [
+            int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
+        ]
+        if not gpu_indices or len(gpu_indices) > 4:
+            return ",".join(_pick_default_pair(rdma_all_devices))
+    except ValueError:
+        warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
+        return ",".join(_pick_default_pair(rdma_all_devices))
+    # 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
+    base_rdma_group = (min(gpu_indices) // 4) * 4
+    for gpu_idx in gpu_indices:
+        if not (base_rdma_group <= gpu_idx < base_rdma_group + 4):
+            warnings.warn(
+                f"GPU index {gpu_idx} is outside expected group "
+                f"{base_rdma_group}-{base_rdma_group+3}"
+            )
+    # 3. Generate RDMA device names
+    rdma_devices = []
+    for gpu_idx in gpu_indices:
+        nic_index = gpu_idx // (8 // n_rdma)
+        rdma_devices.append(rdma_all_devices[nic_index])
+    if not rdma_devices:
+        return ",".join(_pick_default_pair(rdma_all_devices))
+    return ",".join(rdma_devices)

sglang/test/test_marlin_moe.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import types
 from typing import Optional
 import pytest

sglang/test/test_programs.py CHANGED Viewed

@@ -551,7 +551,7 @@ def test_gen_min_new_tokens():
     We verify that the number of tokens in the answer is >= the min_tokens threshold.
     """
     import sglang as sgl
-    from sglang.srt.hf_transformers_utils import get_tokenizer
+    from sglang.srt.utils.hf_transformers_utils import get_tokenizer
     model_path = sgl.global_config.default_backend.endpoint.get_model_name()
     MIN_TOKENS, MAX_TOKENS = 64, 128

sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl