sglang-0.4.3.post1.tar.gz → sglang-0.4.3.post3.tar.gz
This diff compares the contents of two publicly released versions of the package, as published to their registry, and is provided for informational purposes only.
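Before the per-file listing, a quick way to check which side of this diff a given environment is on. A minimal sketch, assuming the top-level package re-exports `__version__` from `sglang/version.py` (that file changes in this release; see the listing below):

```python
# Minimal sketch: report which side of this diff is installed.
# Assumption: sglang/__init__.py re-exports __version__ from
# sglang/version.py, which is updated in this release.
import sglang

print(sglang.__version__)  # expect "0.4.3.post3" after upgrading
```

To move to the newer side, `pip install "sglang==0.4.3.post3"` pins the exact version shown in this diff.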
- {sglang-0.4.3.post1/sglang.egg-info → sglang-0.4.3.post3}/PKG-INFO +21 -15
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/README.md +9 -5
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/pyproject.toml +44 -17
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/api.py +1 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/bench_offline_throughput.py +19 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/bench_one_batch.py +2 -2
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/bench_serving.py +123 -79
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/global_config.py +8 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/runtime_endpoint.py +1 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/ir.py +1 -1
- sglang-0.4.3.post3/sglang/srt/_custom_ops.py +148 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/load_config.py +4 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/model_config.py +48 -2
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang-0.4.3.post3/sglang/srt/constrained/base_grammar_backend.py +194 -0
- sglang-0.4.3.post3/sglang/srt/constrained/llguidance_backend.py +151 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/constrained/outlines_backend.py +24 -33
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/constrained/xgrammar_backend.py +69 -38
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/parallel_state.py +48 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/entrypoints/engine.py +67 -9
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/entrypoints/http_server.py +190 -41
- sglang-0.4.3.post3/sglang/srt/entrypoints/verl_engine.py +147 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/function_call_parser.py +0 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/activation.py +11 -0
- sglang-0.4.3.post1/sglang/srt/layers/attention/__init__.py → sglang-0.4.3.post3/sglang/srt/layers/attention/base_attn_backend.py +14 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang-0.4.3.post3/sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/torch_native_backend.py +1 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/triton_backend.py +9 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang-0.4.3.post3/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang-0.4.3.post3/sglang/srt/layers/attention/utils.py +39 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/vision.py +60 -63
- sglang-0.4.3.post3/sglang/srt/layers/dp_attention.py +212 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/layernorm.py +1 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/linear.py +3 -1
- sglang-0.4.3.post3/sglang/srt/layers/logits_processor.py +567 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang-0.4.3.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0 (filename key scheme decoded in the sketch after this list)
- sglang-0.4.3.post3/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/topk.py +13 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/__init__.py +111 -7
- sglang-0.4.3.post3/sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/fp8.py +69 -28
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang-0.4.3.post3/sglang/srt/layers/quantization/gptq.py +416 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/int8_kernel.py +381 -0
- sglang-0.4.3.post3/sglang/srt/layers/quantization/int8_utils.py +73 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/radix_attention.py +1 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/rotary_embedding.py +0 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/sampler.py +76 -31
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/lora.py +17 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/lora_config.py +5 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/lora_manager.py +1 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/cache_controller.py +193 -62
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/configure_logging.py +2 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang-0.4.3.post3/sglang/srt/managers/detokenizer_manager.py +267 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/image_processor.py +2 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/io_struct.py +143 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/schedule_batch.py +238 -197
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/schedule_policy.py +29 -29
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/scheduler.py +681 -259
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/session_controller.py +6 -2
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/tokenizer_manager.py +224 -68
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/tp_worker.py +15 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang-0.4.3.post3/sglang/srt/mem_cache/hiradix_cache.py +394 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/mem_cache/memory_pool.py +44 -18
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/mem_cache/radix_cache.py +58 -47
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/metrics/collector.py +94 -36
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_executor/forward_batch_info.py +49 -16
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_executor/model_runner.py +209 -28
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_loader/loader.py +3 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_loader/weight_utils.py +36 -14
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/baichuan.py +31 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/chatglm.py +39 -7
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/commandr.py +29 -5
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/dbrx.py +31 -5
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/deepseek.py +43 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/deepseek_nextn.py +32 -19
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/deepseek_v2.py +265 -29
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/exaone.py +19 -9
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/gemma.py +22 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/gemma2.py +25 -12
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/gemma2_reward.py +5 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/gpt2.py +28 -13
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/gpt_bigcode.py +27 -5
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/granite.py +21 -9
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/grok.py +21 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/internlm2.py +36 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/internlm2_reward.py +5 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llama.py +26 -9
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llama_classification.py +5 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llama_eagle.py +17 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llama_embedding.py +5 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llama_reward.py +7 -2
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llava.py +19 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/llavavid.py +10 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/minicpm.py +26 -2
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/minicpm3.py +39 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/minicpmv.py +45 -14
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/mixtral.py +20 -9
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/mixtral_quant.py +50 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/mllama.py +57 -11
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/olmo.py +34 -6
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/olmo2.py +34 -13
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/olmoe.py +26 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/phi3_small.py +29 -10
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/qwen.py +26 -3
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/qwen2.py +26 -4
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/qwen2_5_vl.py +46 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/qwen2_eagle.py +17 -5
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/qwen2_moe.py +44 -6
- sglang-0.4.3.post3/sglang/srt/models/qwen2_rm.py +78 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/qwen2_vl.py +39 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/stablelm.py +32 -5
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/torch_native_llama.py +5 -2
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/xverse.py +21 -9
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/xverse_moe.py +45 -7
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/yivl.py +2 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/openai_api/adapter.py +109 -24
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/openai_api/protocol.py +17 -1
- sglang-0.4.3.post3/sglang/srt/reasoning_parser.py +154 -0
- sglang-0.4.3.post3/sglang/srt/sampling/penaltylib/__init__.py +11 -0
- sglang-0.4.3.post3/sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- {sglang-0.4.3.post1/sglang/srt/sampling/penaltylib/penalizers → sglang-0.4.3.post3/sglang/srt/sampling/penaltylib}/min_new_tokens.py +15 -23 (penalizer modules move up one level; import-path sketch after this list)
- sglang-0.4.3.post3/sglang/srt/sampling/penaltylib/orchestrator.py +197 -0
- sglang-0.4.3.post3/sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/sampling/sampling_batch_info.py +79 -157
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/sampling/sampling_params.py +16 -13
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/server_args.py +136 -52
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/speculative/build_eagle_tree.py +2 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/speculative/eagle_utils.py +92 -58
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/speculative/eagle_worker.py +186 -94
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/speculative/spec_info.py +1 -13
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/utils.py +43 -17
- sglang-0.4.3.post3/sglang/srt/warmup.py +47 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/few_shot_gsm8k.py +4 -1
- sglang-0.4.3.post3/sglang/test/runners.py +670 -0
- sglang-0.4.3.post3/sglang/test/send_one.py +88 -0
- sglang-0.4.3.post3/sglang/test/test_block_fp8_ep.py +361 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/test_programs.py +1 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/test_utils.py +138 -84
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/utils.py +50 -60
- sglang-0.4.3.post3/sglang/version.py +1 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3/sglang.egg-info}/PKG-INFO +21 -15
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang.egg-info/SOURCES.txt +56 -8
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang.egg-info/requires.txt +11 -9
- sglang-0.4.3.post1/sglang/bench_latency.py +0 -1
- sglang-0.4.3.post1/sglang/srt/_custom_ops.py +0 -156
- sglang-0.4.3.post1/sglang/srt/constrained/base_grammar_backend.py +0 -92
- sglang-0.4.3.post1/sglang/srt/layers/dp_attention.py +0 -71
- sglang-0.4.3.post1/sglang/srt/layers/logits_processor.py +0 -331
- sglang-0.4.3.post1/sglang/srt/layers/quantization/int8_kernel.py +0 -54
- sglang-0.4.3.post1/sglang/srt/managers/detokenizer_manager.py +0 -245
- sglang-0.4.3.post1/sglang/srt/sampling/penaltylib/__init__.py +0 -13
- sglang-0.4.3.post1/sglang/srt/sampling/penaltylib/orchestrator.py +0 -346
- sglang-0.4.3.post1/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang-0.4.3.post1/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang-0.4.3.post1/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang-0.4.3.post1/sglang/test/runners.py +0 -407
- sglang-0.4.3.post1/sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- sglang-0.4.3.post1/sglang/version.py +0 -1
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/LICENSE +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/setup.cfg +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/check_env.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/choices.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/launch_server.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/custom_op.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- sglang-0.4.3.post1/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- sglang-0.4.3.post1/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- sglang-0.4.3.post1/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- sglang-0.4.3.post1/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- sglang-0.4.3.post1/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- sglang-0.4.3.post1/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → sglang-0.4.3.post3/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/backend/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/server.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang.egg-info/top_level.txt +0 -0
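The `N=...,K=...` JSON filenames above encode the GEMM shape, target device, quantization dtype, and block shape that each tuned kernel config applies to. As a minimal sketch of that naming convention (the helper names and lookup logic here are illustrative, not SGLang's internal API):

```python
import json
import os

def fp8_config_filename(N: int, K: int, device_name: str) -> str:
    # Mirrors the naming pattern of the JSON files listed above; dtype and
    # block_shape are fixed to the values used throughout this release.
    return (
        f"N={N},K={K},device_name={device_name},"
        f"dtype=fp8_w8a8,block_shape=[128, 128].json"
    )

def load_fp8_config(config_dir: str, N: int, K: int, device_name: str):
    """Return the tuned kernel config for this shape, or None if absent."""
    path = os.path.join(config_dir, fp8_config_filename(N, K, device_name))
    if not os.path.exists(path):
        return None  # a real caller would fall back to a default config
    with open(path) as f:
        return json.load(f)

# Example: the H200 config for the N=7168, K=16384 GEMM shape.
cfg = load_fp8_config(
    "sglang/srt/layers/quantization/configs", 7168, 16384, "NVIDIA_H200"
)
```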
{sglang-0.4.3.post1/sglang.egg-info → sglang-0.4.3.post3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.3.post1
+Version: 0.4.3.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -235,32 +235,34 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.14; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
+Requires-Dist: transformers==4.48.3; extra == "runtime-common"
+Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: torch; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.2.post1; extra == "srt"
+Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
-Requires-Dist:
+Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
+Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
 Requires-Dist: outlines==0.1.11; extra == "srt-hip"
-Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
-Requires-Dist: outlines
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
-Requires-Dist: outlines
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
 Provides-Extra: srt-cpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
 Requires-Dist: torch; extra == "srt-cpu"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
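post3 pins several runtime dependencies exactly (xgrammar 0.1.14, transformers 4.48.3, sgl-kernel 0.0.3.post6, flashinfer_python 0.2.2.post1, torch 2.5.1). A hedged way to audit an environment against those pins via the standard library; the `PINS` table is transcribed from the metadata above, while the checker itself is just a sketch:

```python
from importlib.metadata import PackageNotFoundError, version

# Exact pins taken from the Requires-Dist entries above.
PINS = {
    "xgrammar": "0.1.14",
    "transformers": "4.48.3",
    "sgl-kernel": "0.0.3.post6",
    "flashinfer_python": "0.2.2.post1",
    "torch": "2.5.1",
}

for name, expected in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {expected})")
        continue
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}: {installed} {status}")
```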
@@ -318,7 +320,7 @@ Provides-Extra: dev-cpu
 Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
 Requires-Dist: sglang[test]; extra == "dev-cpu"
 
-<div align="center">
+<div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [](https://pypi.org/project/sglang)
@@ -336,10 +338,11 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -366,7 +369,7 @@ The core features include:
 
 ## Getting Started
 - [Install SGLang](https://docs.sglang.ai/start/install.html)
-- [Quick Start](https://docs.sglang.ai/
+- [Quick Start](https://docs.sglang.ai/backend/send_request.html)
 - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
 - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
 - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
@@ -375,10 +378,13 @@ The core features include:
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
-[Development Roadmap (
+[Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-The project
+The project has been deployed to large-scale production, generating trillions of tokens every day.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+
+<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
 ## Contact Us
{sglang-0.4.3.post1 → sglang-0.4.3.post3}/README.md

@@ -1,4 +1,4 @@
-<div align="center">
+<div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
 
 [](https://pypi.org/project/sglang)
@@ -16,10 +16,11 @@
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/4042)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -46,7 +47,7 @@ The core features include:
 
 ## Getting Started
 - [Install SGLang](https://docs.sglang.ai/start/install.html)
-- [Quick Start](https://docs.sglang.ai/
+- [Quick Start](https://docs.sglang.ai/backend/send_request.html)
 - [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
 - [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
 - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
@@ -55,10 +56,13 @@ The core features include:
 Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
-[Development Roadmap (
+[Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 
 ## Adoption and Sponsorship
-The project
+The project has been deployed to large-scale production, generating trillions of tokens every day.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+
+<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
 
 ## Contact Us
{sglang-0.4.3.post1 → sglang-0.4.3.post3}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.3.post1"
+version = "0.4.3.post3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,32 +17,56 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
 
 [project.optional-dependencies]
 runtime_common = [
-    "aiohttp",
-    "
-    "
-    "
-    "
+    "aiohttp",
+    "decord",
+    "fastapi",
+    "hf_transfer",
+    "huggingface_hub",
+    "interegular",
+    "modelscope",
+    "orjson",
+    "packaging",
+    "pillow",
+    "prometheus-client>=0.20.0",
+    "psutil",
+    "pydantic",
+    "python-multipart",
+    "pyzmq>=25.1.2",
+    "torchao>=0.7.0",
+    "uvicorn",
+    "uvloop",
+    "xgrammar==0.1.14",
+    "ninja",
+    "transformers==4.48.3",
+    "llguidance>=0.6.15"
 ]
+
 srt = [
-    "sglang[runtime_common]",
-    "sgl-kernel
-    "flashinfer_python
+    "sglang[runtime_common]",
+    "sgl-kernel==0.0.3.post6",
+    "flashinfer_python==0.2.2.post1",
+    "torch==2.5.1",
+    "vllm>=0.6.4.post1,<=0.7.2",
+    "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
 ]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
-# => base docker rocm/vllm-dev:
-srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"
+# => base docker rocm/vllm-dev:20250114, not from public vllm whl
+srt_hip = ["sglang[runtime_common]", "sgl-kernel==0.0.3.post6", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
 
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
-srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44
-
-#
-
+srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+
+# For Intel Gaudi(device : hpu) follow the installation guide
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "
+srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -73,7 +97,10 @@ dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
 
 [tool.setuptools.package-data]
-"sglang" = [
+"sglang" = [
+    "srt/layers/moe/fused_moe_triton/configs/*.json",
+    "srt/layers/quantization/configs/*.json",
+]
 
 [tool.setuptools.packages.find]
 exclude = [
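The expanded package-data globs are what guarantee the tuning JSONs ship inside the wheel. Assuming an installed sglang distribution, a short sanity check could look like this (illustrative, not part of the package):

```python
# Counts the JSON tuning configs declared under [tool.setuptools.package-data]
# to confirm they were actually packaged with the installed wheel.
from importlib.resources import files

cfg_dir = files("sglang") / "srt" / "layers" / "quantization" / "configs"
json_files = [p.name for p in cfg_dir.iterdir() if p.name.endswith(".json")]
print(f"{len(json_files)} quantization configs packaged")
```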
{sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/api.py

@@ -94,7 +94,7 @@ def gen(
     regex: Optional[str] = None,
     json_schema: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
 
     if choices:
         return SglSelect(
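For context, `gen()` is the frontend primitive whose docstring path the hunk above corrects. A minimal, hedged usage sketch (backend setup omitted; the prompt text is illustrative):

```python
import sglang as sgl

@sgl.function
def yes_no(s, question):
    # regex is one of the constrained-decoding arguments documented in
    # docs/backend/sampling_params.md (the corrected docstring path above).
    s += "Q: " + question + "\nA: "
    s += sgl.gen("answer", max_tokens=4, regex=r"(yes|no)")
```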
{sglang-0.4.3.post1 → sglang-0.4.3.post3}/sglang/bench_offline_throughput.py

@@ -56,6 +56,7 @@ class BenchArgs:
     profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
+    prompt_suffix: str = ""
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -177,6 +178,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--prompt-suffix",
+            type=str,
+            default="",
+            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
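A hedged round-trip showing how the new flag lands on `BenchArgs`; this assumes the module's other benchmark flags all carry defaults, which the dataclass fields above suggest:

```python
import argparse

from sglang.bench_offline_throughput import BenchArgs

parser = argparse.ArgumentParser()
BenchArgs.add_cli_args(parser)
# Only override the new flag; everything else keeps its dataclass default.
args = parser.parse_args(["--prompt-suffix", " Respond concisely."])
bench_args = BenchArgs.from_cli_args(args)
print(repr(bench_args.prompt_suffix))  # ' Respond concisely.'
```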
@@ -216,6 +223,10 @@ def throughput_test_once(
     ]
 
     if profile:
+        assert (
+            "SGLANG_TORCH_PROFILER_DIR" in os.environ
+        ), "Please set SGLANG_TORCH_PROFILER_DIR."
+        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
         backend.start_profile()
 
     st = time.perf_counter()
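The added assertion makes `--profile` runs fail fast when the trace directory is unset; a caller would export the variable up front, e.g. (the path here is illustrative):

```python
import os

# Must be set before launching the benchmark with --profile; the script now
# asserts on it and creates the directory for the torch profiler traces.
os.environ.setdefault("SGLANG_TORCH_PROFILER_DIR", "/tmp/sglang_profile")
```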
@@ -229,6 +240,8 @@ def throughput_test_once(
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
 
+    server_info = backend.get_server_info()
+
     measurement_results["total_latency"] = latency
     measurement_results["total_output_tokens"] = sum(
         o["meta_info"]["completion_tokens"] for o in gen_out
@@ -246,6 +259,7 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    measurement_results["last_gen_throughput"] = server_info["last_gen_throughput"]
 
     return measurement_results
 
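For reference, the bookkeeping above divides total processed tokens by wall-clock latency; a worked example with illustrative numbers:

```python
# Illustrative numbers only; mirrors the total-throughput arithmetic above.
total_input_tokens = 120_000
total_output_tokens = 60_000
latency = 30.0  # seconds of wall-clock time for the whole run

total_throughput = (total_input_tokens + total_output_tokens) / latency
print(total_throughput)  # 6000.0 tokens/s, input and output combined
```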
@@ -361,6 +375,11 @@ def throughput_test(
     print(
         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Last generation throughput (tok/s):", result["last_gen_throughput"]
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", result["request_throughput"]
|
@@ -230,7 +230,7 @@ def extend(reqs, model_runner):
|
|
230
230
|
batch = ScheduleBatch.init_new(
|
231
231
|
reqs=reqs,
|
232
232
|
req_to_token_pool=model_runner.req_to_token_pool,
|
233
|
-
|
233
|
+
token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
|
234
234
|
tree_cache=None,
|
235
235
|
model_config=model_runner.model_config,
|
236
236
|
enable_overlap=False,
|
@@ -326,7 +326,7 @@ def latency_test_run_once(
 
     # Clear the pools.
     model_runner.req_to_token_pool.clear()
-    model_runner.token_to_kv_pool.clear()
+    model_runner.token_to_kv_pool_allocator.clear()
 
     measurement_results = {
         "run_name": run_name,