sglang 0.4.1.post6__tar.gz → 0.4.1.post7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.1.post6/sglang.egg-info → sglang-0.4.1.post7}/PKG-INFO +16 -5
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/README.md +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/pyproject.toml +9 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/__init__.py +21 -23
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/api.py +2 -7
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_offline_throughput.py +24 -16
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_one_batch.py +51 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_one_batch_server.py +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_serving.py +37 -28
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/runtime_endpoint.py +183 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/chat_template.py +15 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/launch_server.py +1 -1
- sglang-0.4.1.post7/sglang/srt/_custom_ops.py +156 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/device_config.py +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/model_config.py +1 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/base_grammar_backend.py +21 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/xgrammar_backend.py +8 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/conversation.py +14 -1
- sglang-0.4.1.post7/sglang/srt/distributed/__init__.py +3 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/communication_op.py +2 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/parallel_state.py +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/distributed/utils.py +2 -1
- sglang-0.4.1.post7/sglang/srt/entrypoints/engine.py +449 -0
- sglang-0.4.1.post7/sglang/srt/entrypoints/http_server.py +579 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/activation.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang-0.4.1.post7/sglang/srt/layers/attention/vision.py +204 -0
- sglang-0.4.1.post7/sglang/srt/layers/dp_attention.py +69 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/linear.py +41 -5
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/logits_processor.py +48 -63
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang-0.4.1.post7/sglang/srt/layers/moe/fused_moe_native.py +115 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/parameter.py +2 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/__init__.py +20 -23
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/fp8.py +6 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/radix_attention.py +2 -2
- sglang-0.4.1.post7/sglang/srt/layers/rotary_embedding.py +1260 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/sampler.py +39 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/lora/lora.py +1 -9
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/configure_logging.py +3 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/data_parallel_controller.py +79 -72
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/detokenizer_manager.py +23 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/image_processor.py +158 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/io_struct.py +25 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/schedule_batch.py +49 -22
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/schedule_policy.py +26 -12
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/scheduler.py +277 -178
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/session_controller.py +1 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/tokenizer_manager.py +206 -121
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/tp_worker.py +6 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang-0.4.1.post7/sglang/srt/managers/utils.py +44 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/memory_pool.py +10 -32
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/metrics/collector.py +15 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_executor/model_runner.py +37 -15
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/loader.py +8 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/weight_utils.py +55 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/baichuan.py +6 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/chatglm.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/commandr.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/dbrx.py +4 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/deepseek.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/deepseek_v2.py +8 -8
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/exaone.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gemma.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gemma2.py +6 -24
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gpt2.py +3 -5
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gpt_bigcode.py +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/granite.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/grok.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/internlm2.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama.py +7 -5
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/minicpm.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/minicpm3.py +6 -6
- sglang-0.4.1.post7/sglang/srt/models/minicpmv.py +1238 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mixtral.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mixtral_quant.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mllama.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/olmo.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/olmo2.py +4 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/olmoe.py +7 -13
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/phi3_small.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2.py +41 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2_moe.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2_vl.py +22 -122
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/stablelm.py +2 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/torch_native_llama.py +3 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/xverse.py +6 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/xverse_moe.py +6 -6
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/openai_api/protocol.py +2 -0
- sglang-0.4.1.post7/sglang/srt/sampling/custom_logit_processor.py +38 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/sampling_batch_info.py +139 -4
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/sampling_params.py +3 -1
- sglang-0.4.1.post6/sglang/srt/constrained/__init__.py → sglang-0.4.1.post7/sglang/srt/server.py +4 -2
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/server_args.py +57 -14
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/utils.py +103 -65
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/runners.py +8 -13
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_programs.py +1 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_utils.py +3 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/utils.py +12 -2
- sglang-0.4.1.post7/sglang/version.py +1 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7/sglang.egg-info}/PKG-INFO +16 -5
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/SOURCES.txt +7 -3
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/requires.txt +17 -3
- sglang-0.4.1.post6/sglang/launch_server_llavavid.py +0 -25
- sglang-0.4.1.post6/sglang/srt/_custom_ops.py +0 -118
- sglang-0.4.1.post6/sglang/srt/distributed/__init__.py +0 -3
- sglang-0.4.1.post6/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- sglang-0.4.1.post6/sglang/srt/layers/moe/fused_moe_native.py +0 -46
- sglang-0.4.1.post6/sglang/srt/layers/rotary_embedding.py +0 -112
- sglang-0.4.1.post6/sglang/srt/server.py +0 -1104
- sglang-0.4.1.post6/sglang/version.py +0 -1
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/LICENSE +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/setup.cfg +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_latency.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/check_env.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/global_config.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/choices.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/ir.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.1.post6/sglang/srt/distributed/device_communicators → sglang-0.4.1.post7/sglang/srt/layers/moe/ep_moe}/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/moe/topk.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/eagle_worker.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.1.post6/sglang.egg-info → sglang-0.4.1.post7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post6
+Version: 0.4.1.post7
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -236,13 +236,13 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar>=0.1.
+Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.
+Requires-Dist: sgl-kernel>=0.0.2.post14; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
@@ -252,6 +252,9 @@ Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+Provides-Extra: srt-cpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
+Requires-Dist: torch; extra == "srt-cpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -288,6 +291,11 @@ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
 Requires-Dist: sglang[openai]; extra == "all-hpu"
 Requires-Dist: sglang[anthropic]; extra == "all-hpu"
 Requires-Dist: sglang[litellm]; extra == "all-hpu"
+Provides-Extra: all-cpu
+Requires-Dist: sglang[srt_cpu]; extra == "all-cpu"
+Requires-Dist: sglang[openai]; extra == "all-cpu"
+Requires-Dist: sglang[anthropic]; extra == "all-cpu"
+Requires-Dist: sglang[litellm]; extra == "all-cpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -300,6 +308,9 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 Provides-Extra: dev-hpu
 Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
 Requires-Dist: sglang[test]; extra == "dev-hpu"
+Provides-Extra: dev-cpu
+Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
+Requires-Dist: sglang[test]; extra == "dev-cpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -361,7 +372,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.1.post6 → sglang-0.4.1.post7}/README.md

@@ -58,7 +58,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.

{sglang-0.4.1.post6 → sglang-0.4.1.post7}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.1.post6"
+version = "0.4.1.post7"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,11 +23,11 @@ runtime_common = [
     "packaging", "pillow", "prometheus-client>=0.20.0",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
-    "xgrammar>=0.1.
+    "xgrammar>=0.1.10"
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python",
-    "sgl-kernel>=0.0.2.
+    "sgl-kernel>=0.0.2.post14", "torch", "vllm==0.6.4.post1",
     "flashinfer==0.1.6"
 ]
@@ -40,6 +40,10 @@ srt_xpu = ["sglang[runtime_common]"]
 #For Intel Gaudi(device : hpu) follow the installation guide
 #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
 srt_hpu = ["sglang[runtime_common]"]
+# CPU: currently, there are no pre-built vllm wheels for CPU.
+# To install vllm for CPU, please follow the instruction here:
+# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
+srt_cpu = ["sglang[runtime_common]", "torch"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -57,11 +61,13 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
+dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"

{sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/__init__.py

@@ -1,5 +1,6 @@
-#
+# SGLang public APIs
 
+# Frontend Language APIs
 from sglang.api import (
     Engine,
     Runtime,
@@ -23,16 +24,26 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
     token_length_normalized,
     unconditional_likelihood_normalized,
 )
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+# Other configs
+from sglang.global_config import global_config
+from sglang.version import __version__
 
-# SGLang DSL APIs
 __all__ = [
-    "Runtime",
     "Engine",
+    "Runtime",
     "assistant",
     "assistant_begin",
     "assistant_end",
@@ -52,27 +63,14 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "RuntimeEndpoint",
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
 ]
-
-# Global Configurations
-from sglang.global_config import global_config
-
-__all__ += ["global_config"]
-
-from sglang.version import __version__
-
-__all__ += ["__version__"]
-
-# SGLang Backends
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.utils import LazyImport
-
-Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
-LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
-OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
-VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-
-__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]

{sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/api.py

@@ -1,6 +1,5 @@
 """Public APIs of the language."""
 
-import os
 import re
 from typing import Callable, List, Optional, Union
 
@@ -33,19 +32,15 @@ def function(
 
 
 def Runtime(*args, **kwargs):
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
     # Avoid importing unnecessary dependency
-    from sglang.
+    from sglang.lang.backend.runtime_endpoint import Runtime
 
     return Runtime(*args, **kwargs)
 
 
 def Engine(*args, **kwargs):
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-
     # Avoid importing unnecessary dependency
-    from sglang.srt.
+    from sglang.srt.entrypoints.engine import Engine
 
     return Engine(*args, **kwargs)

{sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_offline_throughput.py

@@ -27,7 +27,8 @@ from sglang.bench_serving import (
     sample_random_requests,
     set_ulimit,
 )
-from sglang.
+from sglang.lang.backend.runtime_endpoint import Runtime
+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.server_args import ServerArgs
 
 
@@ -39,14 +40,15 @@ class BenchArgs:
     dataset_path: str = ""
     num_prompts: int = 1000
     sharegpt_output_len: Optional[int] = None
+    sharegpt_context_len: Optional[int] = None
     random_input_len: int = 1024
     random_output_len: int = 1024
     random_range_ratio: float = 0.0
-
-
-
-
-
+    gsp_num_groups: int = 64
+    gsp_prompts_per_group: int = 16
+    gsp_system_prompt_len: int = 2048
+    gsp_question_len: int = 128
+    gsp_output_len: int = 256
     disable_ignore_eos: bool = False
     extra_request_body: Optional[str] = None
     seed: int = 1
@@ -82,6 +84,12 @@ class BenchArgs:
             default=BenchArgs.sharegpt_output_len,
             help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
         )
+        parser.add_argument(
+            "--sharegpt-context-len",
+            type=int,
+            default=BenchArgs.sharegpt_context_len,
+            help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+        )
         parser.add_argument(
             "--random-input-len",
             type=int,
@@ -102,35 +110,35 @@ class BenchArgs:
             "used only for random dataset.",
         )
         parser.add_argument(
-            "--
+            "--gsp-num-groups",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_num_groups,
             help="Number of groups with shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-prompts-per-group",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_prompts_per_group,
             help="Number of prompts per group of shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-system-prompt-len",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_system_prompt_len,
             help="System prompt length, used" "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-question-len",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_question_len,
             help="Question length, used" "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--
+            "--gsp-output-len",
             type=int,
-            default=BenchArgs.
+            default=BenchArgs.gsp_output_len,
             help="Target length in tokens for outputs in generated-shared-prefix dataset",
         )
         parser.add_argument(

@@ -9,7 +9,8 @@ It accepts server arguments (the same as launch_server.py) and benchmark argumen
|
|
9
9
|
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
|
10
10
|
## sweep through multiple data points and store (append) the results in a jsonl file:
|
11
11
|
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
|
12
|
-
|
12
|
+
## run with profiling:
|
13
|
+
python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
|
13
14
|
# Usage (correctness test):
|
14
15
|
python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
|
15
16
|
|
@@ -56,12 +57,12 @@ import torch
|
|
56
57
|
import torch.distributed as dist
|
57
58
|
|
58
59
|
from sglang.srt.configs.model_config import ModelConfig
|
60
|
+
from sglang.srt.entrypoints.engine import _set_envs_and_config
|
59
61
|
from sglang.srt.hf_transformers_utils import get_tokenizer
|
60
62
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
61
63
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
62
64
|
from sglang.srt.model_executor.model_runner import ModelRunner
|
63
65
|
from sglang.srt.sampling.sampling_params import SamplingParams
|
64
|
-
from sglang.srt.server import _set_envs_and_config
|
65
66
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
66
67
|
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
67
68
|
from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
|
@@ -77,6 +78,8 @@ class BenchArgs:
|
|
77
78
|
correctness_test: bool = False
|
78
79
|
# This is only used for correctness test
|
79
80
|
cut_len: int = 4
|
81
|
+
profile: bool = False
|
82
|
+
profile_filename_prefix: str = "profile"
|
80
83
|
|
81
84
|
@staticmethod
|
82
85
|
def add_cli_args(parser: argparse.ArgumentParser):
|
@@ -95,6 +98,19 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
+        parser.add_argument(
+            "--profile-filename-prefix",
+            type=str,
+            default=BenchArgs.profile_filename_prefix,
+            help="Prefix of the profiling file names. The full profiling result file(s) will be "
+            '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +232,7 @@ def extend(reqs, model_runner):
         model_config=model_runner.model_config,
         enable_overlap=False,
         spec_algorithm=SpeculativeAlgorithm.NONE,
+        enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -286,7 +303,16 @@ def synchronize(device):
 
 
 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
+    run_name,
+    model_runner,
+    rank_print,
+    reqs,
+    batch_size,
+    input_len,
+    output_len,
+    device,
+    profile,
+    profile_filename_prefix,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
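The `max_batch_size` guard in this hunk bounds the batch by the KV-cache token budget: each request needs `input_len + output_len` token slots. A quick worked example, where the pool size is an assumed figure rather than one taken from the package:

```python
# Assumed KV-cache capacity; model_runner.max_total_num_tokens in the real code.
max_total_num_tokens = 200_000
input_len, output_len = 512, 256

max_batch_size = max_total_num_tokens // (input_len + output_len)
print(max_batch_size)  # 260, so a requested batch_size of 300 trips the guard
```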
@@ -308,6 +334,17 @@ def latency_test_run_once(
 
     tot_latency = 0
 
+    profiler = None
+    if profile:
+        profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            with_stack=True,
+        )
+        profiler.start()
+
     # Prefill
     synchronize(device)
     tic = time.time()
@@ -338,6 +375,13 @@ def latency_test_run_once(
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
+    if profile:
+        profiler.stop()
+        profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}.trace.json.gz"
+        parent_dir = os.path.dirname(os.path.abspath(profile_filename))
+        os.makedirs(parent_dir, exist_ok=True)
+        profiler.export_chrome_trace(profile_filename)
+
     # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
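The two profiling hunks above bracket the prefill/decode loop with `torch.profiler`. Stripped of benchmark plumbing, this is the standard start/stop/export flow; the sketch below is a generic illustration, with the workload and trace path as placeholders:

```python
import os

import torch


def profiled_run(workload, trace_path="traces/example.trace.json.gz"):
    profiler = torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_stack=True,  # record Python stacks so ops can be attributed to source
    )
    profiler.start()
    workload()  # e.g. one prefill plus decode pass
    profiler.stop()

    # export_chrome_trace writes a Chrome/Perfetto-compatible trace; the diff
    # uses a .gz suffix and creates the parent directory first, as done here.
    os.makedirs(os.path.dirname(os.path.abspath(trace_path)), exist_ok=True)
    profiler.export_chrome_trace(trace_path)


profiled_run(lambda: torch.randn(1024, 1024) @ torch.randn(1024, 1024))
```

The resulting file can be opened in `chrome://tracing` or Perfetto to attribute time to individual kernels.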
@@ -386,6 +430,8 @@ def latency_test(
         bench_args.input_len[0],
         8,  # shorter decoding to speed up the warmup
         server_args.device,
+        profile=False,
+        profile_filename_prefix="",  # not used
     )
 
     rank_print("Benchmark ...")
@@ -405,6 +451,8 @@ def latency_test(
            il,
            ol,
            server_args.device,
+            bench_args.profile,
+            bench_args.profile_filename_prefix,
        )
        if ret is not None:
            result_list.append(ret)
{sglang-0.4.1.post6 → sglang-0.4.1.post7}/sglang/bench_serving.py

@@ -22,7 +22,7 @@ from typing import Tuple
 import numpy as np
 import requests
 
-from sglang.srt.server import launch_server
+from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
 
@@ -452,6 +452,7 @@ def get_dataset(args, tokenizer):
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
+            context_len=args.sharegpt_context_len,
        )
    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
@@ -464,11 +465,11 @@ def get_dataset(args, tokenizer):
        )
    elif args.dataset_name == "generated-shared-prefix":
        input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.
-            prompts_per_group=args.
-            system_prompt_len=args.
-            question_len=args.
-            output_len=args.
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
            tokenizer=tokenizer,
        )
    else:
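To make the renamed knobs concrete, here is a rough request/token accounting for the generated-shared-prefix dataset under the defaults registered later in this diff (64 groups, 16 prompts per group, 2048/128/256 token targets). The arithmetic is illustrative and ignores tokenizer rounding:

```python
gsp_num_groups = 64
gsp_prompts_per_group = 16
gsp_system_prompt_len = 2048
gsp_question_len = 128
gsp_output_len = 256

num_requests = gsp_num_groups * gsp_prompts_per_group  # 1024 requests
input_tokens_per_request = gsp_system_prompt_len + gsp_question_len  # 2176

total_input = num_requests * input_tokens_per_request  # 2,228,224 tokens
total_output = num_requests * gsp_output_len           # 262,144 tokens

# The 2048-token system prompt is shared by all 16 prompts in a group, which is
# what makes this dataset a natural stress test for prefix caching.
print(num_requests, total_input, total_output)
```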
@@ -560,6 +561,7 @@ def sample_sharegpt_requests(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
+    context_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@@ -597,14 +599,15 @@ def sample_sharegpt_requests(
        output_len = (
            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
        )
-
+
+        if prompt_len < 1 or output_len < 1:
            # Prune too short sequences.
            continue
-
-
-        ):
+
+        if context_len and prompt_len + output_len > context_len:
            # Prune too long sequences.
            continue
+
        filtered_dataset.append((prompt, prompt_len, output_len))

    print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
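The new `context_len` check composes with the existing short-sequence pruning: a request is kept only if both lengths are positive and, when a limit is configured, the prompt and output together fit in the context window. As a standalone sketch (the function name and signature are mine, not the package's):

```python
from typing import Optional


def keep_request(prompt_len: int, output_len: int, context_len: Optional[int]) -> bool:
    """Mirror of the filtering logic in sample_sharegpt_requests."""
    if prompt_len < 1 or output_len < 1:
        return False  # prune too-short sequences
    if context_len and prompt_len + output_len > context_len:
        return False  # prune sequences that would overflow the context window
    return True


assert keep_request(100, 50, context_len=4096)
assert not keep_request(3000, 1500, context_len=4096)  # 4500 > 4096, dropped
assert keep_request(3000, 1500, context_len=None)      # no limit configured
```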
@@ -706,8 +709,8 @@ def get_gen_prefix_cache_path(args, tokenizer):
 
    # Create a unique cache filename based on the generation parameters
    cache_key = (
-        f"
-        f"{args.
+        f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
        f"{tokenizer.__class__.__name__}.pkl"
    )
    return cache_dir / cache_key
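For illustration, the cache key built above expands as follows under the default `--gsp-*` values; the tokenizer class name is an assumption for the example:

```python
num_groups, prompts_per_group = 64, 16
system_prompt_len, question_len, output_len = 2048, 128, 256
tokenizer_cls = "PreTrainedTokenizerFast"  # assumed; taken from the loaded tokenizer

cache_key = (
    f"gen_shared_prefix_{num_groups}_{prompts_per_group}_"
    f"{system_prompt_len}_{question_len}_{output_len}_"
    f"{tokenizer_cls}.pkl"
)
print(cache_key)  # gen_shared_prefix_64_16_2048_128_256_PreTrainedTokenizerFast.pkl
```

Encoding every generation parameter plus the tokenizer class in the filename guarantees that a stale cache is never reused after any of them changes.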
@@ -1374,6 +1377,12 @@ if __name__ == "__main__":
        default=None,
        help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
    )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
    parser.add_argument(
        "--random-input-len",
        type=int,
@@ -1453,49 +1462,49 @@ if __name__ == "__main__":
        help="Append given JSON object to the request payload. You can use this to specify "
        "additional generate params like sampling params.",
    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )
 
    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
    group.add_argument(
-        "--
+        "--gsp-num-groups",
        type=int,
        default=64,
        help="Number of system prompt groups for generated-shared-prefix dataset",
    )
    group.add_argument(
-        "--
+        "--gsp-prompts-per-group",
        type=int,
        default=16,
        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
    )
    group.add_argument(
-        "--
+        "--gsp-system-prompt-len",
        type=int,
        default=2048,
        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
    )
    group.add_argument(
-        "--
+        "--gsp-question-len",
        type=int,
        default=128,
        help="Target length in tokens for questions in generated-shared-prefix dataset",
    )
    group.add_argument(
-        "--
+        "--gsp-output-len",
        type=int,
        default=256,
        help="Target length in tokens for outputs in generated-shared-prefix dataset",
    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-    )
-    parser.add_argument(
-        "--lora-name",
-        type=str,
-        default=None,
-        help="The name of LoRA adapter",
-    )
    args = parser.parse_args()
    run_benchmark(args)
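Putting the renamed flags together, plausible invocations against a running server might look like the following, in the style of the usage lines quoted earlier in this diff. Only the flag names are taken from the diff; the backend value, dataset names, and sizes are assumptions for illustration:

python -m sglang.bench_serving --backend sglang --dataset-name generated-shared-prefix --gsp-num-groups 8 --gsp-prompts-per-group 4 --gsp-system-prompt-len 1024 --gsp-question-len 64 --gsp-output-len 128

python -m sglang.bench_serving --backend sglang --dataset-name sharegpt --sharegpt-context-len 4096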