sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
|
@@ -16,11 +16,10 @@ import unittest
|
|
|
16
16
|
from concurrent.futures import ThreadPoolExecutor
|
|
17
17
|
from dataclasses import dataclass
|
|
18
18
|
from datetime import datetime
|
|
19
|
-
from functools import partial
|
|
19
|
+
from functools import partial, wraps
|
|
20
20
|
from pathlib import Path
|
|
21
21
|
from types import SimpleNamespace
|
|
22
22
|
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
|
23
|
-
from urllib.parse import quote
|
|
24
23
|
|
|
25
24
|
import aiohttp
|
|
26
25
|
import numpy as np
|
|
@@ -76,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
|
|
|
76
75
|
DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
|
|
77
76
|
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
|
78
77
|
|
|
78
|
+
# INT4 models
|
|
79
|
+
DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
|
|
80
|
+
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
|
|
81
|
+
)
|
|
82
|
+
|
|
79
83
|
# EAGLE
|
|
80
84
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
|
81
85
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
|
@@ -122,7 +126,12 @@ def is_in_ci():
|
|
|
122
126
|
|
|
123
127
|
def is_in_amd_ci():
|
|
124
128
|
"""Return whether it is in an AMD CI runner."""
|
|
125
|
-
return get_bool_env_var("
|
|
129
|
+
return get_bool_env_var("SGLANG_IS_IN_CI_AMD")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def is_blackwell_system():
|
|
133
|
+
"""Return whether it is running on a Blackwell (B200) system."""
|
|
134
|
+
return get_bool_env_var("IS_BLACKWELL")
|
|
126
135
|
|
|
127
136
|
|
|
128
137
|
def _use_cached_default_models(model_repo: str):
|
|
@@ -136,17 +145,20 @@ def _use_cached_default_models(model_repo: str):
|
|
|
136
145
|
|
|
137
146
|
if is_in_ci():
|
|
138
147
|
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
|
|
139
|
-
|
|
148
|
+
10000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
|
|
140
149
|
)
|
|
141
150
|
else:
|
|
142
151
|
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
|
|
143
|
-
|
|
152
|
+
20000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 1000
|
|
144
153
|
)
|
|
145
154
|
DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
|
|
146
155
|
|
|
147
156
|
if is_in_amd_ci():
|
|
148
157
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
|
|
149
158
|
|
|
159
|
+
if is_blackwell_system():
|
|
160
|
+
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
|
|
161
|
+
|
|
150
162
|
|
|
151
163
|
def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
|
|
152
164
|
assert url is not None
|
|
@@ -397,8 +409,6 @@ def _get_call_generate(args: argparse.Namespace):
|
|
|
397
409
|
return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
|
|
398
410
|
elif args.backend == "srt-raw":
|
|
399
411
|
return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
|
|
400
|
-
elif args.backend == "gserver":
|
|
401
|
-
return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
|
|
402
412
|
elif args.backend == "outlines":
|
|
403
413
|
return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
|
|
404
414
|
elif args.backend == "guidance":
|
|
@@ -504,11 +514,12 @@ def popen_launch_server(
|
|
|
504
514
|
base_url: str,
|
|
505
515
|
timeout: float,
|
|
506
516
|
api_key: Optional[str] = None,
|
|
507
|
-
other_args: list[str] =
|
|
517
|
+
other_args: Optional[list[str]] = None,
|
|
508
518
|
env: Optional[dict] = None,
|
|
509
519
|
return_stdout_stderr: Optional[tuple] = None,
|
|
510
520
|
device: str = "auto",
|
|
511
521
|
pd_separated: bool = False,
|
|
522
|
+
num_replicas: Optional[int] = None,
|
|
512
523
|
):
|
|
513
524
|
"""Launch a server process with automatic device detection.
|
|
514
525
|
|
|
@@ -516,17 +527,19 @@ def popen_launch_server(
|
|
|
516
527
|
device: Device type ("auto", "cuda", "rocm" or "cpu").
|
|
517
528
|
If "auto", will detect available platforms automatically.
|
|
518
529
|
"""
|
|
530
|
+
other_args = other_args or []
|
|
531
|
+
|
|
519
532
|
# Auto-detect device if needed
|
|
520
533
|
if device == "auto":
|
|
521
534
|
device = auto_config_device()
|
|
522
|
-
print(f"Auto-configed device: {device}", flush=True)
|
|
523
535
|
other_args = list(other_args)
|
|
524
536
|
other_args += ["--device", str(device)]
|
|
525
537
|
|
|
526
538
|
_, host, port = base_url.split(":")
|
|
527
539
|
host = host[2:]
|
|
528
540
|
|
|
529
|
-
|
|
541
|
+
use_mixed_pd_engine = not pd_separated and num_replicas is not None
|
|
542
|
+
if pd_separated or use_mixed_pd_engine:
|
|
530
543
|
command = "sglang.launch_pd_server"
|
|
531
544
|
else:
|
|
532
545
|
command = "sglang.launch_server"
|
|
@@ -540,7 +553,7 @@ def popen_launch_server(
|
|
|
540
553
|
*[str(x) for x in other_args],
|
|
541
554
|
]
|
|
542
555
|
|
|
543
|
-
if pd_separated:
|
|
556
|
+
if pd_separated or use_mixed_pd_engine:
|
|
544
557
|
command.extend(
|
|
545
558
|
[
|
|
546
559
|
"--lb-host",
|
|
@@ -559,6 +572,15 @@ def popen_launch_server(
|
|
|
559
572
|
]
|
|
560
573
|
)
|
|
561
574
|
|
|
575
|
+
if use_mixed_pd_engine:
|
|
576
|
+
command.extend(
|
|
577
|
+
[
|
|
578
|
+
"--mixed",
|
|
579
|
+
"--num-replicas",
|
|
580
|
+
str(num_replicas),
|
|
581
|
+
]
|
|
582
|
+
)
|
|
583
|
+
|
|
562
584
|
if api_key:
|
|
563
585
|
command += ["--api-key", api_key]
|
|
564
586
|
|
|
@@ -597,7 +619,6 @@ def popen_launch_server(
|
|
|
597
619
|
start_time = time.perf_counter()
|
|
598
620
|
with requests.Session() as session:
|
|
599
621
|
while time.perf_counter() - start_time < timeout:
|
|
600
|
-
|
|
601
622
|
return_code = process.poll()
|
|
602
623
|
if return_code is not None:
|
|
603
624
|
# Server failed to start (non-zero exit code) or crashed
|
|
@@ -1149,7 +1170,7 @@ def run_bench_offline_throughput(model, other_args):
|
|
|
1149
1170
|
*[str(x) for x in other_args],
|
|
1150
1171
|
]
|
|
1151
1172
|
|
|
1152
|
-
print(f"{command
|
|
1173
|
+
print(f"command={' '.join(command)}")
|
|
1153
1174
|
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
1154
1175
|
|
|
1155
1176
|
try:
|
|
@@ -1608,6 +1629,9 @@ class CustomTestCase(unittest.TestCase):
|
|
|
1608
1629
|
max_retry=max_retry,
|
|
1609
1630
|
)
|
|
1610
1631
|
|
|
1632
|
+
def setUp(self):
|
|
1633
|
+
print(f"[Test Method] {self._testMethodName}", flush=True)
|
|
1634
|
+
|
|
1611
1635
|
|
|
1612
1636
|
def dump_bench_raw_result(
|
|
1613
1637
|
path: str,
|
|
@@ -1641,15 +1665,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
|
|
|
1641
1665
|
return text.removesuffix(suffix)
|
|
1642
1666
|
|
|
1643
1667
|
|
|
1644
|
-
class
|
|
1645
|
-
def __init__(
|
|
1668
|
+
class ModelLaunchSettings:
|
|
1669
|
+
def __init__(
|
|
1670
|
+
self,
|
|
1671
|
+
model_path: str,
|
|
1672
|
+
tp_size: int = 1,
|
|
1673
|
+
extra_args: Optional[List[str]] = None,
|
|
1674
|
+
env: Optional[dict] = None,
|
|
1675
|
+
):
|
|
1646
1676
|
self.model_path = model_path
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1677
|
+
self.tp_size = tp_size
|
|
1678
|
+
self.extra_args = list(extra_args) if extra_args else []
|
|
1679
|
+
self.env = env
|
|
1680
|
+
|
|
1681
|
+
if self.tp_size > 1 and "--tp" not in self.extra_args:
|
|
1682
|
+
self.extra_args.extend(["--tp", str(self.tp_size)])
|
|
1651
1683
|
|
|
1652
|
-
|
|
1684
|
+
fixed_args = ["--enable-multimodal", "--trust-remote-code"]
|
|
1685
|
+
for fixed_arg in fixed_args:
|
|
1686
|
+
if fixed_arg not in self.extra_args:
|
|
1687
|
+
self.extra_args.append(fixed_arg)
|
|
1653
1688
|
|
|
1654
1689
|
|
|
1655
1690
|
class ModelEvalMetrics:
|
|
@@ -1782,3 +1817,33 @@ def write_results_to_json(model, metrics, mode="a"):
|
|
|
1782
1817
|
|
|
1783
1818
|
with open("results.json", "w") as f:
|
|
1784
1819
|
json.dump(existing_results, f, indent=2)
|
|
1820
|
+
|
|
1821
|
+
|
|
1822
|
+
def intel_amx_benchmark(extra_args=None, min_throughput=None):
|
|
1823
|
+
def decorator(test_func):
|
|
1824
|
+
@wraps(test_func)
|
|
1825
|
+
def wrapper(self):
|
|
1826
|
+
common_args = [
|
|
1827
|
+
"--attention-backend",
|
|
1828
|
+
"intel_amx",
|
|
1829
|
+
"--disable-radix",
|
|
1830
|
+
"--trust-remote-code",
|
|
1831
|
+
]
|
|
1832
|
+
full_args = common_args + (extra_args or [])
|
|
1833
|
+
|
|
1834
|
+
model = test_func(self)
|
|
1835
|
+
prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
|
|
1836
|
+
model, full_args
|
|
1837
|
+
)
|
|
1838
|
+
|
|
1839
|
+
print(f"{model=}")
|
|
1840
|
+
print(f"{prefill_latency=}")
|
|
1841
|
+
print(f"{decode_throughput=}")
|
|
1842
|
+
print(f"{decode_latency=}")
|
|
1843
|
+
|
|
1844
|
+
if is_in_ci() and min_throughput is not None:
|
|
1845
|
+
self.assertGreater(decode_throughput, min_throughput)
|
|
1846
|
+
|
|
1847
|
+
return wrapper
|
|
1848
|
+
|
|
1849
|
+
return decorator
|
sglang/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.5.
|
|
1
|
+
__version__ = "0.5.4"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sglang
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: SGLang is a fast serving framework for large language models and vision language models.
|
|
5
5
|
License: Apache License
|
|
6
6
|
Version 2.0, January 2004
|
|
@@ -218,10 +218,11 @@ Requires-Dist: blobfile==3.0.0
|
|
|
218
218
|
Requires-Dist: build
|
|
219
219
|
Requires-Dist: compressed-tensors
|
|
220
220
|
Requires-Dist: cuda-python
|
|
221
|
+
Requires-Dist: decord2
|
|
221
222
|
Requires-Dist: datasets
|
|
222
223
|
Requires-Dist: einops
|
|
223
224
|
Requires-Dist: fastapi
|
|
224
|
-
Requires-Dist: flashinfer_python==0.4.
|
|
225
|
+
Requires-Dist: flashinfer_python==0.4.1
|
|
225
226
|
Requires-Dist: hf_transfer
|
|
226
227
|
Requires-Dist: huggingface_hub
|
|
227
228
|
Requires-Dist: interegular
|
|
@@ -243,34 +244,37 @@ Requires-Dist: psutil
|
|
|
243
244
|
Requires-Dist: py-spy
|
|
244
245
|
Requires-Dist: pybase64
|
|
245
246
|
Requires-Dist: pydantic
|
|
246
|
-
Requires-Dist:
|
|
247
|
+
Requires-Dist: nvidia-ml-py
|
|
247
248
|
Requires-Dist: python-multipart
|
|
248
249
|
Requires-Dist: pyzmq>=25.1.2
|
|
249
250
|
Requires-Dist: requests
|
|
250
251
|
Requires-Dist: scipy
|
|
251
252
|
Requires-Dist: sentencepiece
|
|
252
253
|
Requires-Dist: setproctitle
|
|
253
|
-
Requires-Dist: sgl-kernel==0.3.
|
|
254
|
+
Requires-Dist: sgl-kernel==0.3.16.post3
|
|
254
255
|
Requires-Dist: soundfile==0.13.1
|
|
255
256
|
Requires-Dist: tiktoken
|
|
256
257
|
Requires-Dist: timm==1.0.16
|
|
257
258
|
Requires-Dist: torch==2.8.0
|
|
258
|
-
Requires-Dist: torch_memory_saver==0.0.
|
|
259
|
+
Requires-Dist: torch_memory_saver==0.0.9
|
|
259
260
|
Requires-Dist: torchao==0.9.0
|
|
260
261
|
Requires-Dist: torchaudio==2.8.0
|
|
261
262
|
Requires-Dist: torchvision
|
|
262
263
|
Requires-Dist: tqdm
|
|
263
|
-
Requires-Dist: transformers==4.57.
|
|
264
|
+
Requires-Dist: transformers==4.57.1
|
|
264
265
|
Requires-Dist: uvicorn
|
|
265
266
|
Requires-Dist: uvloop
|
|
266
|
-
Requires-Dist: xgrammar==0.1.
|
|
267
|
+
Requires-Dist: xgrammar==0.1.25
|
|
267
268
|
Requires-Dist: grpcio==1.75.1
|
|
268
269
|
Requires-Dist: grpcio-tools==1.75.1
|
|
269
|
-
|
|
270
|
-
Requires-Dist:
|
|
270
|
+
Requires-Dist: grpcio-reflection==1.75.1
|
|
271
|
+
Requires-Dist: grpcio-health-checking==1.75.1
|
|
272
|
+
Provides-Extra: modelopt
|
|
273
|
+
Requires-Dist: nvidia-modelopt; extra == "modelopt"
|
|
271
274
|
Provides-Extra: test
|
|
272
275
|
Requires-Dist: accelerate; extra == "test"
|
|
273
276
|
Requires-Dist: expecttest; extra == "test"
|
|
277
|
+
Requires-Dist: gguf; extra == "test"
|
|
274
278
|
Requires-Dist: jsonlines; extra == "test"
|
|
275
279
|
Requires-Dist: matplotlib; extra == "test"
|
|
276
280
|
Requires-Dist: pandas; extra == "test"
|
|
@@ -278,24 +282,28 @@ Requires-Dist: peft; extra == "test"
|
|
|
278
282
|
Requires-Dist: pytest; extra == "test"
|
|
279
283
|
Requires-Dist: sentence_transformers; extra == "test"
|
|
280
284
|
Requires-Dist: tabulate; extra == "test"
|
|
285
|
+
Provides-Extra: checkpoint-engine
|
|
286
|
+
Requires-Dist: checkpoint-engine==0.1.2; extra == "checkpoint-engine"
|
|
287
|
+
Provides-Extra: all
|
|
288
|
+
Provides-Extra: dev
|
|
289
|
+
Requires-Dist: sglang[test]; extra == "dev"
|
|
290
|
+
Provides-Extra: cu130
|
|
291
|
+
Requires-Dist: torch==2.9.0; extra == "cu130"
|
|
292
|
+
Requires-Dist: torchaudio==2.9.0; extra == "cu130"
|
|
293
|
+
Requires-Dist: torchvision==0.24.0; extra == "cu130"
|
|
294
|
+
Provides-Extra: cu130-all
|
|
295
|
+
Requires-Dist: sglang[test]; extra == "cu130-all"
|
|
296
|
+
Requires-Dist: sglang[decord]; extra == "cu130-all"
|
|
297
|
+
Requires-Dist: sglang[cu130]; extra == "cu130-all"
|
|
281
298
|
Provides-Extra: tracing
|
|
282
299
|
Requires-Dist: opentelemetry-api; extra == "tracing"
|
|
283
300
|
Requires-Dist: opentelemetry-exporter-otlp; extra == "tracing"
|
|
284
301
|
Requires-Dist: opentelemetry-exporter-otlp-proto-grpc; extra == "tracing"
|
|
285
302
|
Requires-Dist: opentelemetry-sdk; extra == "tracing"
|
|
286
|
-
Provides-Extra: all
|
|
287
|
-
Requires-Dist: sglang[test]; extra == "all"
|
|
288
|
-
Requires-Dist: sglang[decord]; extra == "all"
|
|
289
|
-
Provides-Extra: all-aarch64
|
|
290
|
-
Requires-Dist: sglang[test]; extra == "all-aarch64"
|
|
291
|
-
Provides-Extra: dev
|
|
292
|
-
Requires-Dist: sglang[test]; extra == "dev"
|
|
293
|
-
Requires-Dist: sglang[decord]; extra == "dev"
|
|
294
303
|
Provides-Extra: blackwell
|
|
295
|
-
Requires-Dist: sglang[
|
|
296
|
-
Requires-Dist: sglang[decord]; extra == "blackwell"
|
|
304
|
+
Requires-Dist: sglang[dev]; extra == "blackwell"
|
|
297
305
|
Provides-Extra: blackwell-aarch64
|
|
298
|
-
Requires-Dist: sglang[
|
|
306
|
+
Requires-Dist: sglang[dev]; extra == "blackwell-aarch64"
|
|
299
307
|
Dynamic: license-file
|
|
300
308
|
|
|
301
309
|
<div align="center" id="sglangtop">
|
|
@@ -320,18 +328,20 @@ Dynamic: license-file
|
|
|
320
328
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
|
321
329
|
|
|
322
330
|
## News
|
|
331
|
+
- [2025/09] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part II): 3.8x Prefill, 4.8x Decode Throughput ([blog](https://lmsys.org/blog/2025-09-25-gb200-part-2/)).
|
|
332
|
+
- [2025/09] 🔥 SGLang Day 0 Support for DeepSeek-V3.2 with Sparse Attention ([blog](https://lmsys.org/blog/2025-09-29-deepseek-V32/)).
|
|
323
333
|
- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
|
|
324
|
-
- [2025/08]
|
|
325
|
-
- [2025/
|
|
326
|
-
- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
|
|
327
|
-
- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
|
|
328
|
-
- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
|
|
334
|
+
- [2025/08] SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
|
|
335
|
+
- [2025/05] Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
|
|
329
336
|
- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
|
|
330
337
|
- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
|
|
331
338
|
|
|
332
339
|
<details>
|
|
333
340
|
<summary>More</summary>
|
|
334
341
|
|
|
342
|
+
- [2025/06] SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
|
|
343
|
+
- [2025/06] Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
|
|
344
|
+
- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
|
|
335
345
|
- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
|
|
336
346
|
- [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
|
|
337
347
|
- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
|
|
@@ -344,14 +354,15 @@ Dynamic: license-file
|
|
|
344
354
|
</details>
|
|
345
355
|
|
|
346
356
|
## About
|
|
347
|
-
SGLang is a
|
|
348
|
-
It
|
|
349
|
-
|
|
357
|
+
SGLang is a high-performance serving framework for large language models and vision-language models.
|
|
358
|
+
It is designed to deliver low-latency and high-throughput inference across a wide range of setups, from a single GPU to large distributed clusters.
|
|
359
|
+
Its core features include:
|
|
350
360
|
|
|
351
|
-
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-
|
|
352
|
-
- **
|
|
353
|
-
- **Extensive
|
|
354
|
-
- **
|
|
361
|
+
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, a zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-LoRA batching.
|
|
362
|
+
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GLM, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse), and reward models (Skywork), with easy extensibility for integrating new models. Compatible with most Hugging Face models and OpenAI APIs.
|
|
363
|
+
- **Extensive Hardware Support**: Runs on NVIDIA GPUs (GB200/B300/H100/A100/Spark), AMD GPUs (MI355/MI300), Intel Xeon CPUs, Google TPUs, Ascend NPUs, and more.
|
|
364
|
+
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, supporting chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
|
|
365
|
+
- **Active Community**: SGLang is open-source and supported by a vibrant community with widespread industry adoption, powering over 300,000 GPUs worldwide.
|
|
355
366
|
|
|
356
367
|
## Getting Started
|
|
357
368
|
- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
|
|
@@ -367,7 +378,8 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
|
|
|
367
378
|
[Development Roadmap (2025 H2)](https://github.com/sgl-project/sglang/issues/7736)
|
|
368
379
|
|
|
369
380
|
## Adoption and Sponsorship
|
|
370
|
-
SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over
|
|
381
|
+
SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 300,000 GPUs worldwide.
|
|
382
|
+
SGLang is currently hosted under the non-profit open-source organization [LMSYS](https://lmsys.org/about/).
|
|
371
383
|
|
|
372
384
|
<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
|
|
373
385
|
|