sglang 0.4.1.post7__tar.gz → 0.4.2.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.1.post7/sglang.egg-info → sglang-0.4.2.post1}/PKG-INFO +8 -8
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/README.md +6 -6
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/pyproject.toml +2 -2
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_offline_throughput.py +17 -11
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_one_batch.py +14 -6
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_serving.py +47 -44
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/chat_template.py +31 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/load_config.py +1 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/entrypoints/engine.py +5 -2
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/entrypoints/http_server.py +24 -0
- sglang-0.4.2.post1/sglang/srt/function_call_parser.py +494 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/activation.py +5 -5
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
- sglang-0.4.2.post1/sglang/srt/layers/attention/vision.py +407 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/dp_attention.py +3 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/layernorm.py +5 -5
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/linear.py +24 -9
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/logits_processor.py +1 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/layer.py +20 -12
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_native.py +17 -3
- sglang-0.4.2.post1/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/parameter.py +16 -7
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang-0.4.2.post1/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8.py +11 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/rotary_embedding.py +34 -13
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/sampler.py +33 -10
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/torchao_utils.py +12 -6
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/detokenizer_manager.py +1 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/image_processor.py +77 -38
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/io_struct.py +36 -5
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/schedule_batch.py +31 -25
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/scheduler.py +78 -38
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/tokenizer_manager.py +4 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/base_prefix_cache.py +4 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/chunk_cache.py +3 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/radix_cache.py +30 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_executor/cuda_graph_runner.py +23 -25
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_executor/forward_batch_info.py +5 -7
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_executor/model_runner.py +7 -4
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/loader.py +75 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/weight_utils.py +91 -5
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/commandr.py +14 -2
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/dbrx.py +9 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/deepseek_v2.py +3 -3
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gemma2.py +9 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/grok.py +1 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/minicpm3.py +3 -3
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/minicpmv.py +129 -76
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mllama.py +16 -56
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2.py +4 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_vl.py +18 -8
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/torch_native_llama.py +17 -4
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/openai_api/adapter.py +139 -37
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/openai_api/protocol.py +5 -4
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/sampling_batch_info.py +4 -14
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/server.py +2 -2
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/server_args.py +26 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/eagle_utils.py +37 -15
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/eagle_worker.py +11 -13
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/utils.py +62 -67
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_programs.py +1 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_utils.py +81 -22
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/utils.py +42 -0
- sglang-0.4.2.post1/sglang/version.py +1 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1/sglang.egg-info}/PKG-INFO +8 -8
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/SOURCES.txt +11 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/requires.txt +1 -1
- sglang-0.4.1.post7/sglang/srt/layers/attention/vision.py +0 -204
- sglang-0.4.1.post7/sglang/version.py +0 -1
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/LICENSE +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/setup.cfg +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/api.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_latency.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/check_env.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/global_config.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/launch_server.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/runners.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang.egg-info/top_level.txt +0 -0
```diff
--- sglang-0.4.1.post7/sglang.egg-info/PKG-INFO
+++ sglang-0.4.2.post1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post7
+Version: 0.4.2.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                 Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.
+Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
@@ -333,16 +333,16 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [
-- [2024/
-- [2024/09]
-- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -372,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
```
```diff
--- sglang-0.4.1.post7/README.md
+++ sglang-0.4.2.post1/README.md
@@ -19,16 +19,16 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [
-- [2024/
-- [2024/09]
-- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
```
```diff
--- sglang-0.4.1.post7/pyproject.toml
+++ sglang-0.4.2.post1/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.1.post7"
+version = "0.4.2.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -27,7 +27,7 @@ runtime_common = [
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python",
-    "sgl-kernel>=0.0.
+    "sgl-kernel>=0.0.3", "torch", "vllm==0.6.4.post1",
     "flashinfer==0.1.6"
 ]
 
```
```diff
--- sglang-0.4.1.post7/sglang/bench_offline_throughput.py
+++ sglang-0.4.2.post1/sglang/bench_offline_throughput.py
@@ -49,12 +49,13 @@ class BenchArgs:
     gsp_system_prompt_len: int = 2048
     gsp_question_len: int = 128
     gsp_output_len: int = 256
+    seed: int = 1
     disable_ignore_eos: bool = False
     extra_request_body: Optional[str] = None
-
+    apply_chat_template: bool = False
+    profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
-    profile: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -141,20 +142,31 @@ class BenchArgs:
             default=BenchArgs.gsp_output_len,
             help="Target length in tokens for outputs in generated-shared-prefix dataset",
         )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
         parser.add_argument(
             "--disable-ignore-eos",
-
-            default=BenchArgs.disable_ignore_eos,
+            action="store_true",
             help="Disable ignore EOS token",
         )
         parser.add_argument(
             "--extra-request-body",
             metavar='{"key1": "value1", "key2": "value2"}',
             type=str,
+            default=BenchArgs.extra_request_body,
             help="Append given JSON object to the request payload. You can use this to specify"
             "additional generate params like sampling params.",
         )
-        parser.add_argument(
+        parser.add_argument(
+            "--apply-chat-template",
+            action="store_true",
+            help="Apply chat template",
+        )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
         parser.add_argument(
             "--skip-warmup",
             action="store_true",
```
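A note on the `--disable-ignore-eos` change above: the old definition set `default=BenchArgs.disable_ignore_eos` on the argument, while the new one declares a plain boolean switch with `action="store_true"`. Here is a minimal, self-contained sketch of how the switch form behaves; only the flag name comes from the diff, the rest is illustrative:

```python
import argparse

# A bare boolean switch: absent -> False, present -> True.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-ignore-eos",
    action="store_true",
    help="Disable ignore EOS token",
)

print(parser.parse_args([]).disable_ignore_eos)                        # False
print(parser.parse_args(["--disable-ignore-eos"]).disable_ignore_eos)  # True

# Contrast with a type=bool argument: argparse would call bool() on the
# raw string, and every non-empty string (even "False") is truthy.
print(bool("False"))  # True
```

The last hunk in this file drops the old `--profile` definition, which the block above re-registers earlier in the argument list: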
```diff
@@ -165,12 +177,6 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
-        parser.add_argument(
-            "--profile",
-            action="store_true",
-            help="Use Torch Profiler. The endpoint must be launched with "
-            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
```
```diff
--- sglang-0.4.1.post7/sglang/bench_one_batch.py
+++ sglang-0.4.2.post1/sglang/bench_one_batch.py
@@ -65,7 +65,13 @@ from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    configure_logger,
+    get_bool_env_var,
+    kill_process_tree,
+    set_gpu_proc_affinity,
+    suppress_other_loggers,
+)
 
 
 @dataclasses.dataclass
@@ -99,10 +105,7 @@ class BenchArgs:
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
         parser.add_argument(
-            "--profile",
-            action="store_true",
-            help="Use Torch Profiler. The endpoint must be launched with "
-            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+            "--profile", action="store_true", help="Use Torch Profiler."
         )
         parser.add_argument(
             "--profile-filename-prefix",
@@ -381,6 +384,7 @@ def latency_test_run_once(
         parent_dir = os.path.dirname(os.path.abspath(profile_filename))
         os.makedirs(parent_dir, exist_ok=True)
         profiler.export_chrome_trace(profile_filename)
+        rank_print(f"torch profiler chrome trace saved to {profile_filename}")
 
     # Record decode timing from 2nd output
     if output_len > 1:
```
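The new log line above reports the path passed to `profiler.export_chrome_trace`. For readers unfamiliar with that API, here is a minimal self-contained sketch of the same `torch.profiler` pattern; the workload and output path are made up:

```python
import torch

# Profile a small CPU workload and dump a Chrome trace; open the file
# in chrome://tracing or https://ui.perfetto.dev to inspect it.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
) as profiler:
    a = torch.randn(512, 512)
    b = torch.randn(512, 512)
    (a @ b).sum()

profiler.export_chrome_trace("/tmp/bench_one_batch_trace.json")
```

`latency_test` in the same file also gains an opt-in CPU affinity step: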
```diff
@@ -407,6 +411,10 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Set CPU affinity
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
+
     # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
```
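The affinity step is gated on an environment variable. A sketch of the opt-in, assuming `get_bool_env_var` follows the usual truthy-string convention (the real helper lives in `sglang.srt.utils` and its accepted values may differ):

```python
import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Assumed behavior: "true"/"1" (case-insensitive) means enabled.
    return os.getenv(name, default).lower() in ("true", "1")

# Opt in before launching so each TP rank pins its process to a CPU
# subset via set_gpu_proc_affinity.
os.environ["SGLANG_SET_CPU_AFFINITY"] = "1"
assert get_bool_env_var("SGLANG_SET_CPU_AFFINITY")
```

The final bench_one_batch.py hunk enables profiling only on TP rank 0: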
```diff
@@ -451,7 +459,7 @@
         il,
         ol,
         server_args.device,
-        bench_args.profile,
+        bench_args.profile if tp_rank == 0 else None,
         bench_args.profile_filename_prefix,
     )
     if ret is not None:
```
```diff
--- sglang-0.4.1.post7/sglang/bench_serving.py
+++ sglang-0.4.2.post1/sglang/bench_serving.py
@@ -453,6 +453,7 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
             context_len=args.sharegpt_context_len,
+            apply_chat_template=args.apply_chat_template,
         )
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
@@ -517,6 +518,7 @@ class BenchmarkMetrics:
     median_e2e_latency_ms: float
     std_e2e_latency_ms: float
     p99_e2e_latency_ms: float
+    concurrency: float
 
 
 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -562,6 +564,7 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
     context_len: Optional[int] = None,
+    apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -592,6 +595,15 @@
 
         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
+
+        if apply_chat_template:
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt = prompt.replace(tokenizer.bos_token, "")
+
         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
         completion_token_ids = tokenizer.encode(completion)
```
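The `apply_chat_template` branch added above wraps each raw ShareGPT prompt in the tokenizer's chat format before token counting. A standalone sketch of what that does for one prompt, using the Hugging Face `transformers` API; the model name is illustrative, and note that `tokenizer.bos_token` can be `None` for some tokenizers, so the unguarded `replace` in the diff assumes a model that defines one:

```python
from transformers import AutoTokenizer

# Illustrative chat-tuned tokenizer; any model with a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

prompt = "What is the capital of France?"
templated = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    add_generation_prompt=True,  # append the assistant header so the model answers
    tokenize=False,              # return a string rather than token ids
)
# Strip the BOS token as the diff does: the server re-adds it when it
# tokenizes the request, so leaving it in would duplicate it.
if tokenizer.bos_token:
    templated = templated.replace(tokenizer.bos_token, "")
print(templated)
```

The next bench_serving.py hunks tighten the sequence-pruning threshold and compute the new concurrency metric: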
@@ -600,7 +612,7 @@ def sample_sharegpt_requests(
|
|
600
612
|
len(completion_token_ids) if fixed_output_len is None else fixed_output_len
|
601
613
|
)
|
602
614
|
|
603
|
-
if prompt_len <
|
615
|
+
if prompt_len < 2 or output_len < 2:
|
604
616
|
# Prune too short sequences.
|
605
617
|
continue
|
606
618
|
|
@@ -880,6 +892,7 @@ def calculate_metrics(
         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
         std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
         p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+        concurrency=np.sum(e2e_latencies) / dur_s,
     )

     return metrics, output_lens
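The new `concurrency` metric is the sum of per-request end-to-end latencies divided by the wall-clock duration, which by Little's law is the average number of requests in flight. A small worked example with made-up numbers:

```python
import numpy as np

# Four requests of 2 s each, executed as two overlapping pairs over 4 s.
e2e_latencies = np.array([2.0, 2.0, 2.0, 2.0])
dur_s = 4.0

concurrency = np.sum(e2e_latencies) / dur_s
print(concurrency)  # 2.0 -> on average two requests were in flight
```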
@@ -1031,6 +1044,7 @@ async def benchmark(
             "Total token throughput (tok/s):", metrics.total_throughput
         )
     )
+    print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
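The added line follows the report's two `str.format` conventions, shown in isolation below with a made-up value:

```python
# A 40-char left-aligned label next to a 2-decimal float, and a section
# title centered in a 50-char dashed rule, as used in the report above.
print("{:<40} {:<10.2f}".format("Concurrency:", 11.87))
print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
```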
@@ -1062,13 +1076,24 @@ async def benchmark(
         and metrics.output_throughput is not None
     ):
         result = {
+            # Arguments
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": request_rate,
             "max_concurrency": max_concurrency,
+            "sharegpt_output_len": args.sharegpt_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            # Results
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "request_throughput": metrics.request_throughput,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
             "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
             "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
             "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
@@ -1085,14 +1110,7 @@ async def benchmark(
             "median_itl_ms": metrics.median_itl_ms,
             "std_itl_ms": metrics.std_itl_ms,
             "p99_itl_ms": metrics.p99_itl_ms,
-            "
-            "output_throughput": metrics.output_throughput,
-            "sharegpt_output_len": args.sharegpt_output_len,
-            "random_input_len": args.random_input_len,
-            "random_output_len": args.random_output_len,
-            "random_range_ratio": args.random_range_ratio,
-            "duration": benchmark_duration,
-            "completed": metrics.completed,
+            "concurrency": metrics.concurrency,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -1112,36 +1130,16 @@ async def benchmark(
     with open(output_file_name, "a") as file:
         file.write(json.dumps(result) + "\n")

-        result
-
-
-
-
-
-
-
-
-
-            "median_ttft_ms": metrics.median_ttft_ms,
-            "std_ttft_ms": metrics.std_ttft_ms,
-            "p99_ttft_ms": metrics.p99_ttft_ms,
-            "mean_tpot_ms": metrics.mean_tpot_ms,
-            "median_tpot_ms": metrics.median_tpot_ms,
-            "std_tpot_ms": metrics.std_tpot_ms,
-            "p99_tpot_ms": metrics.p99_tpot_ms,
-            "mean_itl_ms": metrics.mean_itl_ms,
-            "median_itl_ms": metrics.median_itl_ms,
-            "std_itl_ms": metrics.std_itl_ms,
-            "p99_itl_ms": metrics.p99_itl_ms,
-            "input_lens": [output.prompt_len for output in outputs],
-            "output_lens": output_lens,
-            "ttfts": [output.ttft for output in outputs],
-            "itls": [output.itl for output in outputs],
-            "generated_texts": [output.generated_text for output in outputs],
-            "errors": [output.error for output in outputs],
-            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
-            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
-        }
+    result.update(
+        {
+            "input_lens": [output.prompt_len for output in outputs],
+            "output_lens": output_lens,
+            "ttfts": [output.ttft for output in outputs],
+            "itls": [output.itl for output in outputs],
+            "generated_texts": [output.generated_text for output in outputs],
+            "errors": [output.error for output in outputs],
+        }
+    )
     return result

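Each run is appended to the output file as one JSON object per line, so results accumulate across invocations. A sketch of reading them back; the filename is illustrative, the script derives its own default:

```python
import json

with open("benchmark_results.jsonl") as f:
    rows = [json.loads(line) for line in f if line.strip()]

for row in rows:
    print(row["request_rate"], row["output_throughput"], row["concurrency"])
```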
@@ -1422,7 +1420,6 @@ if __name__ == "__main__":
         "actual request rate may be lower than specified with --request-rate, "
         "if the server is not processing requests fast enough to keep up.",
     )
-    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
         action="store_true",
@@ -1446,14 +1443,15 @@ if __name__ == "__main__":
         help="Disable streaming mode.",
     )
     parser.add_argument(
-        "--
+        "--return-logprob",
         action="store_true",
-        help="
+        help="Return logprob.",
     )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
-        "--
+        "--disable-ignore-eos",
         action="store_true",
-        help="
+        help="Disable ignoring EOS.",
     )
     parser.add_argument(
         "--extra-request-body",
@@ -1462,6 +1460,11 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
+    parser.add_argument(
+        "--apply-chat-template",
+        action="store_true",
+        help="Apply chat template",
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
{sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/lang/chat_template.py

@@ -354,6 +354,37 @@ register_chat_template(
 )


+register_chat_template(
+    ChatTemplate(
+        name="deepseek-v3",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_deepseek(model_path: str):
+    if (
+        "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
+    ) and "base" not in model_path.lower():
+        return get_chat_template("deepseek-v3")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
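`ChatTemplate` builds prompts by wrapping every message in its role's prefix/suffix pair, then emitting the assistant prefix as the generation prompt. An independent re-implementation sketch (not sglang's renderer) showing how the deepseek-v3 entries above compose:

```python
# Affix table copied from the registration above.
ROLE_AFFIXES = {
    "system": ("", ""),
    "user": ("<|User|>", ""),
    "assistant": ("<|Assistant|>", "<|end▁of▁sentence|>"),
}

def render(messages):
    parts = []
    for role, content in messages:
        prefix, suffix = ROLE_AFFIXES[role]
        parts.append(f"{prefix}{content}{suffix}")
    parts.append(ROLE_AFFIXES["assistant"][0])  # open the assistant turn
    return "".join(parts)

print(render([("system", "You are helpful."), ("user", "Hi!")]))
# You are helpful.<|User|>Hi!<|Assistant|>
```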
{sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py

@@ -185,9 +185,12 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
-        assert is_cuda()
+        if is_cuda():
+            assert is_cuda()

-        full_nvlink = is_full_nvlink(physical_device_ids)
+            full_nvlink = is_full_nvlink(physical_device_ids)
+        else:
+            full_nvlink = False
         if world_size > 2 and not full_nvlink:
             logger.warning(
                 "Custom allreduce is disabled because it's not supported on"
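`is_full_nvlink` is defined elsewhere; the usual approach in comparable code is to ask NVML whether every GPU pair reports NVLink peer-to-peer support. A hedged sketch assuming the nvidia-ml-py bindings; sglang's real helper may differ:

```python
import pynvml

def full_nvlink_sketch(device_ids):
    """Return True only if every GPU pair reports NVLink P2P status OK."""
    pynvml.nvmlInit()
    try:
        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
        for i, h1 in enumerate(handles):
            for h2 in handles[i + 1 :]:
                status = pynvml.nvmlDeviceGetP2PStatus(
                    h1, h2, pynvml.NVML_P2P_CAPS_INDEX_NVLINK
                )
                if status != pynvml.NVML_P2P_STATUS_OK:
                    return False  # one missing link disqualifies the topology
        return True
    finally:
        pynvml.nvmlShutdown()
```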
{sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/entrypoints/engine.py

@@ -57,6 +57,7 @@ from sglang.srt.utils import (
     assert_pkg_version,
     configure_logger,
     kill_process_tree,
+    launch_dummy_health_check_server,
     maybe_set_triton_cache_manager,
     prepare_model_and_tokenizer,
     set_prometheus_multiproc_dir,
@@ -400,14 +401,16 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic

     if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
         # When using `Engine` as a Python API, we don't want to block here.
-        return
+        return None, None
+
+    launch_dummy_health_check_server(server_args.host, server_args.port)

     for proc in scheduler_procs:
         proc.join()
         logger.error(
             f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
         )
-        return
+        return None, None

     # Launch detokenizer process
     detoken_proc = mp.Process(
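The two bare `return` statements are fixed to return a tuple matching the annotated `Tuple` return type, and non-rank-0 nodes now answer health probes via `launch_dummy_health_check_server`. Its body is not part of this diff; a hedged sketch of a minimal stand-in using only the standard library:

```python
from http.server import BaseHTTPRequestHandler, HTTPServer

class _HealthHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # Answer health probes; everything else is a 404.
        if self.path == "/health":
            self.send_response(200)
        else:
            self.send_response(404)
        self.end_headers()

def launch_dummy_health_check_server_sketch(host: str, port: int) -> None:
    """Hypothetical stand-in for sglang's launch_dummy_health_check_server."""
    HTTPServer((host, port), _HealthHandler).serve_forever()
```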
{sglang-0.4.1.post7 → sglang-0.4.2.post1}/sglang/srt/entrypoints/http_server.py

@@ -39,10 +39,12 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

 from sglang.srt.entrypoints.engine import _launch_subprocesses
+from sglang.srt.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
+    FunctionCallReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
@@ -369,6 +371,28 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
     return Response(status_code=200)


+@app.post("/function_call")
+async def function_call_request(obj: FunctionCallReqInput, request: Request):
+    """
+    A native API endpoint to parse function calls from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = FunctionCallParser(tools=obj.tools, tool_call_parser=obj.tool_call_parser)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    normal_text, calls = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "normal_text": normal_text,
+        "calls": [
+            call.model_dump() for call in calls
+        ],  # Convert pydantic objects to dictionaries
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
 ##### OpenAI-compatible API endpoints #####

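A client for the new endpoint might look like the following. The `text`, `tools`, and `tool_call_parser` fields mirror the handler above, while the tool schema and the parser name are assumptions about `FunctionCallReqInput` rather than a documented contract:

```python
import requests

payload = {
    "text": 'I will call get_weather({"city": "Paris"}) now.',
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                },
            },
        }
    ],
    "tool_call_parser": "llama3",  # assumed parser name
}

# 30000 is sglang's default server port; adjust to your deployment.
resp = requests.post("http://localhost:30000/function_call", json=payload)
data = resp.json()
print(data["normal_text"])  # text outside of tool calls
print(data["calls"])        # parsed tool calls as dictionaries
```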