sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/session_controller.py

@@ -54,7 +54,7 @@ class SessionReqNode:
         prefix += " -- " + self.childs[0].req.rid
         ret = self.childs[0]._str_helper(prefix)
         for child in self.childs[1:]:
-            prefix = " " * len(origin_prefix) + " \- " + child.req.rid
+            prefix = " " * len(origin_prefix) + r" \- " + child.req.rid
             ret += child._str_helper(prefix)
         return ret
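The one-line change above swaps the plain literal " \- " for the raw string r" \- ". "\-" is not a recognized escape sequence, so both spellings produce the same characters today, but CPython warns on the non-raw form (DeprecationWarning before 3.12, SyntaxWarning from 3.12 on) and intends to make it an error eventually. A quick illustration, not taken from the package:

    # "\-" is an invalid escape sequence: the backslash is kept as-is, but
    # newer Pythons warn and may eventually reject the non-raw spelling.
    plain = " \- "   # warns: invalid escape sequence '\-'
    raw = r" \- "    # raw string keeps the backslash literally, no warning
    assert plain == raw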
sglang/srt/managers/tokenizer_manager.py

@@ -16,7 +16,9 @@
 import asyncio
 import copy
 import dataclasses
+import json
 import logging
+import math
 import os
 import pickle
 import signal

@@ -41,6 +43,7 @@ from typing import (
 )
 
 import fastapi
+import torch
 import uvloop
 import zmq
 import zmq.asyncio

@@ -90,6 +93,8 @@ from sglang.srt.managers.io_struct import (
     ResumeMemoryOccupationReqInput,
     ResumeMemoryOccupationReqOutput,
     SessionParams,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
     SlowDownReqInput,
     SlowDownReqOutput,
     TokenizedEmbeddingReqInput,

@@ -111,6 +116,7 @@ from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     dataclass_to_string_truncated,
+    get_bool_env_var,
     get_zmq_socket,
     kill_process_tree,
 )

@@ -169,6 +175,11 @@ class TokenizerManager:
         self.enable_metrics = server_args.enable_metrics
         self.log_requests = server_args.log_requests
         self.log_requests_level = server_args.log_requests_level
+        self.preferred_sampling_params = (
+            json.loads(server_args.preferred_sampling_params)
+            if server_args.preferred_sampling_params
+            else None
+        )
 
         # Init inter-process communication
         context = zmq.asyncio.Context(2)

@@ -213,7 +224,7 @@ class TokenizerManager:
             self.tokenizer = get_tokenizer_from_processor(self.processor)
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
         else:
-            self.mm_processor =
+            self.mm_processor = None
 
         if server_args.skip_tokenizer_init:
             self.tokenizer = self.processor = None

@@ -228,6 +239,7 @@ class TokenizerManager:
         # Store states
         self.no_create_loop = False
         self.rid_to_state: Dict[str, ReqState] = {}
+        self.health_check_failed = False
         self.gracefully_exit = False
         self.last_receive_tstamp = 0
         self.dump_requests_folder = ""  # By default do not dump

@@ -255,6 +267,10 @@ class TokenizerManager:
                 "model_name": self.server_args.served_model_name,
                 # TODO: Add lora name/path in the future,
             },
+            bucket_time_to_first_token=self.server_args.bucket_time_to_first_token,
+            bucket_e2e_request_latency=self.server_args.bucket_e2e_request_latency,
+            bucket_inter_token_latency=self.server_args.bucket_inter_token_latency,
+            collect_tokens_histogram=self.server_args.collect_tokens_histogram,
         )
 
         # Communicators

@@ -282,12 +298,16 @@ class TokenizerManager:
         self.flush_cache_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
-        self.
+        self.profile_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
+        self.health_check_communitcator = _Communicator(self.send_to_scheduler, 1)
         self.get_internal_state_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
+        self.set_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
         self.expert_distribution_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
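The new profile, health-check, and set-internal-state channels all reuse the `_Communicator` helper defined elsewhere in this file: each instance sends one request to the scheduler and gathers one reply per DP rank (or per its configured fan-in) before resolving. A minimal sketch of that request/fan-in pattern, as a simplified reimplementation rather than the actual sglang class:

    import asyncio
    from typing import Any, Callable, List

    class MiniCommunicator:
        """Send one request to the scheduler, then wait for `fan_in` replies."""

        def __init__(self, send_fn: Callable[[Any], None], fan_in: int):
            self._send_fn = send_fn          # e.g. a zmq socket's send_pyobj
            self._fan_in = fan_in            # e.g. server_args.dp_size
            self._replies: List[Any] = []
            self._done = asyncio.Event()

        async def __call__(self, req: Any) -> List[Any]:
            self._replies.clear()
            self._done.clear()
            self._send_fn(req)
            await self._done.wait()          # resolved by handle_recv below
            return list(self._replies)

        def handle_recv(self, reply: Any) -> None:
            # Registered in the dispatcher for the matching output type.
            self._replies.append(reply)
            if len(self._replies) == self._fan_in:
                self._done.set()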
@@ -343,12 +363,16 @@ class TokenizerManager:
             ),
             (
                 ProfileReqOutput,
-                self.
+                self.profile_communicator.handle_recv,
             ),
             (
                 GetInternalStateReqOutput,
                 self.get_internal_state_communicator.handle_recv,
             ),
+            (
+                SetInternalStateReqOutput,
+                self.set_internal_state_communicator.handle_recv,
+            ),
             (
                 ExpertDistributionReqOutput,
                 self.expert_distribution_communicator.handle_recv,

@@ -374,6 +398,9 @@ class TokenizerManager:
             self.server_args.disaggregation_bootstrap_port
         )
 
+        self.current_load = 0
+        self.current_load_lock = asyncio.Lock()
+
     async def generate_request(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],

@@ -401,8 +428,8 @@ class TokenizerManager:
         is_single = obj.is_single
         if is_single:
             tokenized_obj = await self._tokenize_one_request(obj)
-            self._send_one_request(obj, tokenized_obj, created_time)
-            async for response in self._wait_one_response(obj, request):
+            state = self._send_one_request(obj, tokenized_obj, created_time)
+            async for response in self._wait_one_response(obj, state, request):
                 yield response
         else:
             async for response in self._handle_batch_request(

@@ -438,14 +465,17 @@ class TokenizerManager:
             )
             input_ids = self.tokenizer.encode(input_text)
 
-
-
-
-
-
-
-
-
+        if self.mm_processor and obj.contains_mm_input():
+            image_inputs = await self.mm_processor.process_mm_data_async(
+                image_data=obj.image_data,
+                input_text=input_text or input_ids,
+                request_obj=obj,
+                max_req_input_len=self.max_req_input_len,
+            )
+            if image_inputs and "input_ids" in image_inputs:
+                input_ids = image_inputs["input_ids"]
+        else:
+            image_inputs: Optional[Dict] = None
 
         self._validate_token_len(obj, input_ids)
         return self._create_tokenized_object(

@@ -508,7 +538,14 @@ class TokenizerManager:
                 "Please set `--enable-custom-logits-processor` to enable this feature."
             )
 
-
+        # Parse sampling parameters
+        # Note: if there are preferred sampling params, we use them if they are not
+        # explicitly passed in sampling_params
+        if self.preferred_sampling_params:
+            sampling_kwargs = {**self.preferred_sampling_params, **obj.sampling_params}
+        else:
+            sampling_kwargs = obj.sampling_params
+        sampling_params = SamplingParams(**sampling_kwargs)
         sampling_params.normalize(self.tokenizer)
         sampling_params.verify()
 
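The sampling-parameter hunk above completes the `preferred_sampling_params` feature introduced in the `__init__` change earlier: the server-level defaults (parsed once from JSON, presumably supplied via a --preferred-sampling-params style flag) are merged under the per-request values. Because `obj.sampling_params` is unpacked last, any key the request sets explicitly wins. A small illustration with made-up values:

    preferred = {"temperature": 0.2, "top_p": 0.9}  # server-level defaults
    per_request = {"temperature": 0.7}              # from the incoming request

    merged = {**preferred, **per_request}
    # {'temperature': 0.7, 'top_p': 0.9}: the request overrides temperature,
    # while top_p falls back to the server-level preference.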
@@ -533,6 +570,7 @@ class TokenizerManager:
                 session_params=session_params,
                 custom_logit_processor=obj.custom_logit_processor,
                 return_hidden_states=obj.return_hidden_states,
+                data_parallel_rank=obj.data_parallel_rank,
             )
         elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(

@@ -598,15 +636,15 @@ class TokenizerManager:
         self.send_to_scheduler.send_pyobj(tokenized_obj)
         state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
         self.rid_to_state[obj.rid] = state
+        return state
 
     async def _wait_one_response(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
+        state: ReqState,
         request: Optional[fastapi.Request] = None,
     ):
         """Wait for the response of one request."""
-        state = self.rid_to_state[obj.rid]
-
         while True:
             try:
                 await asyncio.wait_for(state.event.wait(), timeout=4)
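This `_send_one_request` / `_wait_one_response` change is the hub of a refactor that the caller updates below follow: the sender now returns the `ReqState` it registers, and the waiter receives that state directly instead of re-reading `self.rid_to_state[obj.rid]`. The old lookup could raise `KeyError` if the entry was removed (for example by an abort) between sending and waiting; the related defensive `pop(rid, None)` appears further down. The essence of the change, as an illustrative sketch rather than sglang code:

    import asyncio

    registry = {}

    def send_one(rid):
        state = {"finished": False}
        registry[rid] = state
        return state                  # new behavior: hand the state back

    async def wait_one(state):
        # The old code re-read `registry[rid]` here, which raises KeyError
        # if an abort already popped the entry; a passed-in reference can't.
        while not state["finished"]:
            await asyncio.sleep(0)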
@@ -667,7 +705,6 @@ class TokenizerManager:
 
         generators = []
         rids = []
-
         if getattr(obj, "parallel_sample_num", 1) == 1:
             if self.server_args.enable_tokenizer_batch_encode:
                 # Validate batch tokenization constraints

@@ -677,16 +714,16 @@ class TokenizerManager:
 
                 for i, tokenized_obj in enumerate(tokenized_objs):
                     tmp_obj = obj[i]
-                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                    generators.append(self._wait_one_response(tmp_obj, request))
+                    state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
                     rids.append(tmp_obj.rid)
             else:
                 # Sequential tokenization and processing
                 for i in range(batch_size):
                     tmp_obj = obj[i]
                     tokenized_obj = await self._tokenize_one_request(tmp_obj)
-                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                    generators.append(self._wait_one_response(tmp_obj, request))
+                    state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
                     rids.append(tmp_obj.rid)
         else:
             # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.

@@ -711,8 +748,8 @@
                 tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
                 tokenized_obj.sampling_params.max_new_tokens = 0
                 tokenized_obj.stream = False
-                self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                await self._wait_one_response(tmp_obj, request).__anext__()
+                state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                await self._wait_one_response(tmp_obj, state, request).__anext__()
 
             # Expand requests, assign new rids for them, and send them
             for i in range(batch_size):

@@ -720,8 +757,8 @@
                 tmp_obj = copy.copy(objs[i])
                 tokenized_obj = copy.copy(tokenized_objs[i])
                 tokenized_obj.rid = tmp_obj.regenerate_rid()
-                self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                generators.append(self._wait_one_response(tmp_obj, request))
+                state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                generators.append(self._wait_one_response(tmp_obj, state, request))
                 rids.append(tmp_obj.rid)
 
             # Wait for all requests

@@ -757,6 +794,9 @@
             req = AbortReq(rid)
             self.send_to_scheduler.send_pyobj(req)
 
+        if self.enable_metrics:
+            self.metrics_collector.observe_one_aborted_request()
+
     async def start_profile(
         self,
         output_dir: Optional[str] = None,

@@ -764,7 +804,11 @@
         activities: Optional[List[str]] = None,
         with_stack: Optional[bool] = None,
         record_shapes: Optional[bool] = None,
+        profile_by_stage: bool = False,
     ):
+        self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
         req = ProfileReq(
             type=ProfileReqType.START_PROFILE,
             output_dir=output_dir,
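The last hunk above resolves `with_stack` from two sources: only an explicit False, from either the caller or the SGLANG_PROFILE_WITH_STACK environment variable, disables stack capture; a None argument means "no opinion" and falls through to True. The same truth table as a standalone helper (hypothetical, for illustration only):

    def resolve_with_stack(arg, env_default):
        # arg is Optional[bool]; env_default mirrors SGLANG_PROFILE_WITH_STACK.
        # An explicit False from either source wins; None defaults to True.
        return False if arg is False or env_default is False else True

    assert resolve_with_stack(None, True) is True
    assert resolve_with_stack(False, True) is False
    assert resolve_with_stack(True, False) is False   # the env can veto too
    assert resolve_with_stack(None, False) is False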
@@ -772,24 +816,32 @@
             activities=activities,
             with_stack=with_stack,
             record_shapes=record_shapes,
+            profile_by_stage=profile_by_stage,
             profile_id=str(time.time()),
         )
-
+        return await self._execute_profile(req)
+
+    async def stop_profile(self):
+        self.auto_create_handle_loop()
+        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
+        return await self._execute_profile(req)
+
+    async def _execute_profile(self, req: ProfileReq):
+        result = (await self.profile_communicator(req))[0]
         if not result.success:
             raise RuntimeError(result.message)
         return result
 
-    def stop_profile(self):
-        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
-        self.send_to_scheduler.send_pyobj(req)
-
     async def start_expert_distribution_record(self):
+        self.auto_create_handle_loop()
         await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
 
     async def stop_expert_distribution_record(self):
+        self.auto_create_handle_loop()
         await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
 
     async def dump_expert_distribution_record(self):
+        self.auto_create_handle_loop()
         await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
 
     async def update_weights_from_disk(
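With this hunk, `stop_profile` is no longer fire-and-forget (previously a bare `send_pyobj` with no reply): both entry points now route through `_execute_profile`, which awaits the scheduler's `ProfileReqOutput` and raises on failure. Assuming a `TokenizerManager` instance `tm`, usage would look roughly like:

    async def profile_one_window(tm):
        # `tm` is assumed to be a TokenizerManager; names match the diff above.
        await tm.start_profile(
            output_dir="/tmp/sglang_trace",   # hypothetical path
            with_stack=True,
            profile_by_stage=False,
        )
        # ... run the workload to be profiled ...
        result = await tm.stop_profile()      # now awaited; raises RuntimeError on failure
        return result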
@@ -804,7 +856,7 @@
             obj.load_format = self.server_args.load_format
             logger.info("Start update_weights. Load format=%s", obj.load_format)
 
-        if True:
+        if True:  # Keep this redundant check to simplify some internal code sync
             # Hold the lock if it is not async. This means that weight sync
             # cannot run while requests are in progress.
             async with self.model_update_lock.writer_lock:

@@ -856,8 +908,8 @@
     ) -> Tuple[bool, str]:
         self.auto_create_handle_loop()
         assert (
-            self.server_args.dp_size == 1
-        ), "dp_size must be for update weights from distributed"
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
 
         # This means that weight sync
         # cannot run while requests are in progress.

@@ -872,8 +924,8 @@
     ) -> Tuple[bool, str]:
         self.auto_create_handle_loop()
         assert (
-            self.server_args.dp_size == 1
-        ), "dp_size must be 1 for update weights from tensor"
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
 
         # This means that weight sync
         # cannot run while requests are in progress.

@@ -946,6 +998,22 @@
             # Many DP ranks
             return [res.internal_state for res in responses]
 
+    async def get_load(self) -> dict:
+        # TODO(lsyin): fake load report server
+        if not self.current_load_lock.locked():
+            async with self.current_load_lock:
+                internal_state = await self.get_internal_state()
+                self.current_load = internal_state[0]["load"]
+        return {"load": self.current_load}
+
+    async def set_internal_state(
+        self, obj: SetInternalStateReq
+    ) -> SetInternalStateReqOutput:
+        responses: List[SetInternalStateReqOutput] = (
+            await self.set_internal_state_communicator(obj)
+        )
+        return [res.internal_state for res in responses]
+
     def get_log_request_metadata(self):
         max_length = None
         skip_names = None
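`get_load` uses `Lock.locked()` as a cheap guard: if another coroutine is already refreshing the load, concurrent callers skip the refresh and return the cached value instead of queueing behind the lock. A standalone sketch of that coalescing pattern, with an assumed stand-in for the real RPC:

    import asyncio

    class LoadReporter:
        def __init__(self):
            self.current_load = 0
            self._lock = asyncio.Lock()

        async def _query_backend(self) -> int:
            await asyncio.sleep(0.1)   # stand-in for the real internal-state RPC
            return 42

        async def get_load(self) -> dict:
            # Only one refresh in flight; everyone else returns the cached
            # value immediately rather than piling up behind the lock.
            if not self._lock.locked():
                async with self._lock:
                    self.current_load = await self._query_backend()
            return {"load": self.current_load}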
@@ -1015,11 +1083,17 @@ class TokenizerManager:
             loop.create_task(print_exception_wrapper(self.handle_loop))
         )

+        self.event_loop = loop
+
         # We cannot add signal handler when the tokenizer manager is not in
         # the main thread due to the CPython limitation.
         if threading.current_thread() is threading.main_thread():
             signal_handler = SignalHandler(self)
-            loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler)
+            loop.add_signal_handler(signal.SIGTERM, signal_handler.sigterm_handler)
+            # Update the signal handler for the process. It overrides the sigquit handler in the launch phase.
+            loop.add_signal_handler(
+                signal.SIGQUIT, signal_handler.running_phase_sigquit_handler
+            )
         else:
             logger.warning(
                 "Signal handler is not added because the tokenizer manager is "
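
Both handlers above are registered with loop.add_signal_handler(), which invokes the callback with no arguments on the event-loop thread (hence the signum=None/frame=None defaults on the handler methods). A stand-alone sketch of the same registration pattern; the handler bodies here are illustrative, not the sglang ones. Note add_signal_handler() is Unix-only and must run on the main thread:

import asyncio
import os
import signal

def on_sigterm():
    print("SIGTERM: drain in-flight requests, then shut down")

def on_sigquit():
    print("SIGQUIT: a child process failed, exit hard")

async def main():
    loop = asyncio.get_running_loop()
    loop.add_signal_handler(signal.SIGTERM, on_sigterm)
    loop.add_signal_handler(signal.SIGQUIT, on_sigquit)
    os.kill(os.getpid(), signal.SIGTERM)  # deliver a signal to ourselves
    await asyncio.sleep(0.1)              # give the loop a chance to run the handler

asyncio.run(main())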
@@ -1037,6 +1111,15 @@ class TokenizerManager:
         # Drain requests
         while True:
             remain_num_req = len(self.rid_to_state)
+
+            if self.health_check_failed:
+                # if health check failed, we should exit immediately
+                logger.error(
+                    "Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",
+                    remain_num_req,
+                )
+                break
+
             logger.info(
                 f"Gracefully exiting... remaining number of requests {remain_num_req}"
             )
@@ -1120,7 +1203,16 @@ class TokenizerManager:
                     "meta_info": meta_info,
                 }
             elif isinstance(recv_obj, BatchMultimodalOut):
-                raise NotImplementedError()
+                if isinstance(recv_obj.outputs[i], str):
+                    out_dict = {
+                        "text": recv_obj.outputs[i],
+                        "meta_info": meta_info,
+                    }
+                else:
+                    out_dict = {
+                        "outputs": json.dumps(recv_obj.outputs[i]),
+                        "meta_info": meta_info,
+                    }
             else:
                 assert isinstance(recv_obj, BatchEmbeddingOut)
                 out_dict = {
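
The new BatchMultimodalOut branch keys on the per-item output type: plain strings are returned as text, anything structured is JSON-encoded under "outputs". A small stand-alone check of the two paths, using hypothetical payloads rather than real BatchMultimodalOut contents:

import json

meta_info = {"id": "req-0"}
for output in ["a plain text answer", {"image_tokens": [1, 2, 3]}]:
    if isinstance(output, str):
        out_dict = {"text": output, "meta_info": meta_info}
    else:
        out_dict = {"outputs": json.dumps(output), "meta_info": meta_info}
    print(out_dict)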
@@ -1331,7 +1423,7 @@ class TokenizerManager:
         asyncio.create_task(asyncio.to_thread(background_task))

     def _handle_abort_req(self, recv_obj):
-        self.rid_to_state.pop(recv_obj.rid)
+        self.rid_to_state.pop(recv_obj.rid, None)

     def _handle_open_session_req_output(self, recv_obj):
         self.session_futures[recv_obj.session_id].set_result(
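
This one-argument change makes abort handling idempotent: dict.pop(key) raises KeyError when the request id is already gone (for example, the request finished before the abort arrived), while pop(key, None) turns the duplicate abort into a no-op:

rid_to_state = {"req-1": "running"}
rid_to_state.pop("req-1", None)  # removes the entry
rid_to_state.pop("req-1", None)  # duplicate abort: no KeyError, nothing to do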
@@ -1347,6 +1439,100 @@ class TokenizerManager:
         if len(self.model_update_tmp) == self.server_args.dp_size:
             self.model_update_result.set_result(self.model_update_tmp)

+    async def score_request(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+        request: Optional[Any] = None,
+    ) -> List[List[float]]:
+        """
+        See Engine.score() for more details.
+        """
+        if label_token_ids is None:
+            raise ValueError("label_token_ids must be provided")
+
+        if self.tokenizer is not None:
+            vocab_size = self.tokenizer.vocab_size
+            for token_id in label_token_ids:
+                if token_id >= vocab_size:
+                    raise ValueError(
+                        f"Token ID {token_id} is out of vocabulary (vocab size: {vocab_size})"
+                    )
+
+        # Handle string or tokenized query/items
+        if isinstance(query, str) and (
+            isinstance(items, str)
+            or (isinstance(items, list) and (not items or isinstance(items[0], str)))
+        ):
+            # Both query and items are text
+            items_list = [items] if isinstance(items, str) else items
+            if item_first:
+                prompts = [f"{item}{query}" for item in items_list]
+            else:
+                prompts = [f"{query}{item}" for item in items_list]
+            batch_request = GenerateReqInput(
+                text=prompts,
+                return_logprob=True,
+                token_ids_logprob=label_token_ids,
+                stream=False,
+                sampling_params={"max_new_tokens": 1},
+            )
+        elif (
+            isinstance(query, list)
+            and isinstance(items, list)
+            and items
+            and isinstance(items[0], list)
+        ):
+            # Both query and items are token IDs
+            if item_first:
+                input_ids_list = [item + query for item in items]
+            else:
+                input_ids_list = [query + item for item in items]
+            batch_request = GenerateReqInput(
+                input_ids=input_ids_list,
+                return_logprob=True,
+                token_ids_logprob=label_token_ids,
+                stream=False,
+                sampling_params={"max_new_tokens": 1},
+            )
+        else:
+            raise ValueError(
+                "Invalid combination of query/items types for score_request."
+            )
+
+        results = await self.generate_request(batch_request, request).__anext__()
+        scores = []
+
+        for result in results:
+            # Get logprobs for each token
+            logprobs = {}
+            for logprob, token_id, _ in result["meta_info"].get(
+                "output_token_ids_logprobs", []
+            )[0]:
+                if token_id in label_token_ids:
+                    logprobs[token_id] = logprob
+
+            # Get scores in order of label_token_ids
+            score_list = [
+                logprobs.get(token_id, float("-inf")) for token_id in label_token_ids
+            ]
+
+            # Apply softmax to logprobs if needed
+            if apply_softmax:
+                score_list = torch.softmax(torch.tensor(score_list), dim=0).tolist()
+            else:
+                # Convert logprobs to probabilities if not using softmax
+                score_list = [
+                    math.exp(x) if x != float("-inf") else 0.0 for x in score_list
+                ]
+
+            scores.append(score_list)
+
+        return scores
+

 async def print_exception_wrapper(func):
     """
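
The tail of score_request() turns the label-token logprobs of the first generated token into scores, either normalized over the label set (softmax) or exponentiated independently. A worked example of just that arithmetic, on made-up logprobs for two hypothetical label token ids:

import math

import torch

label_token_ids = [101, 102]
logprobs = {101: -0.3, 102: -1.5}  # hypothetical values from one generation

score_list = [logprobs.get(t, float("-inf")) for t in label_token_ids]

# apply_softmax=True: scores are normalized over the label set.
softmaxed = torch.softmax(torch.tensor(score_list), dim=0).tolist()
# apply_softmax=False: each logprob is exponentiated independently.
independent = [math.exp(x) if x != float("-inf") else 0.0 for x in score_list]

print(softmaxed)    # sums to 1.0 across the two labels
print(independent)  # e**-0.3 ~= 0.741, e**-1.5 ~= 0.223; need not sum to 1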
@@ -1366,12 +1552,18 @@ class SignalHandler:
     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager

-    def signal_handler(self, signum=None, frame=None):
+    def sigterm_handler(self, signum=None, frame=None):
         logger.warning(
             f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
         )
         self.tokenizer_manager.gracefully_exit = True

+    def running_phase_sigquit_handler(self, signum=None, frame=None):
+        logger.error(
+            "Received sigquit from a child process. It usually means the child failed."
+        )
+        kill_process_tree(os.getpid())
+

 T = TypeVar("T")

sglang/srt/managers/utils.py
CHANGED
@@ -35,10 +35,6 @@ def validate_input_length(
             f"the maximum allowed length ({max_req_input_len} tokens). "
             f"Use a shorter input or enable --allow-auto-truncate."
         )
-        logger.error(error_msg)
-        req.finished_reason = FINISH_ABORT(
-            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
-        )
        return error_msg

    return None
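
With the logging and FINISH_ABORT side effects removed here, validate_input_length() only reports the problem; deciding how to abort is now the caller's job. A self-contained sketch of that division of labor, where Req, the length check, and the abort policy are all simplified stand-ins, not the sglang implementations:

class Req:
    def __init__(self, input_ids):
        self.origin_input_ids = input_ids
        self.finished_reason = None

def validate_input_length(req, max_req_input_len):
    # Pure check: return an error message, or None if the input fits.
    if len(req.origin_input_ids) >= max_req_input_len:
        return (
            f"The input ({len(req.origin_input_ids)} tokens) is longer than "
            f"the maximum allowed length ({max_req_input_len} tokens)."
        )
    return None

req = Req(input_ids=list(range(10)))
error_msg = validate_input_length(req, max_req_input_len=8)
if error_msg is not None:
    # The caller picks the failure policy: log, mark aborted, respond 400, etc.
    req.finished_reason = ("abort", error_msg)
print(req.finished_reason)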
sglang/srt/mem_cache/chunk_cache.py
CHANGED
@@ -38,7 +38,9 @@ class ChunkCache(BasePrefixCache):

     def cache_finished_req(self, req: Req):
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
+            req.req_pool_idx,
+            # For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
+            : len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0),
         ]
         self.req_to_token_pool.free(req.req_pool_idx)
         self.token_to_kv_pool_allocator.free(kv_indices)
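
A worked check of the new slice bound: on a decode server a request can finish with no decoded tokens, and a bound of len(origin_input_ids) + len(output_ids) - 1 would then leave one prompt slot unfreed, whereas max(len(output_ids) - 1, 0) frees the whole prompt. Plain token counts stand in for the KV indices:

def freed_slot_count(num_input_tokens: int, num_output_tokens: int) -> int:
    # Mirrors the slice bound in cache_finished_req above.
    return num_input_tokens + max(num_output_tokens - 1, 0)

print(freed_slot_count(5, 0))  # 5: whole prompt freed (without max() this is 4)
print(freed_slot_count(5, 2))  # 6: prompt plus all but the last output token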
sglang/srt/mem_cache/hiradix_cache.py
CHANGED
@@ -335,13 +335,13 @@ class HiRadixCache(RadixCache):
         return value, last_node

     def _match_prefix_helper(self, node: TreeNode, key: List):
-        node.last_access_time = time.time()
+        node.last_access_time = time.monotonic()
         child_key = self.get_child_key_fn(key)
         value = []

         while len(key) > 0 and child_key in node.children.keys():
             child = node.children[child_key]
-            child.last_access_time = time.time()
+            child.last_access_time = time.monotonic()
             prefix_len = self.key_match_fn(child.key, key)
             if prefix_len < len(child.key):
                 new_node = self._split_node(child.key, child, prefix_len)
@@ -386,7 +386,7 @@ class HiRadixCache(RadixCache):
         return new_node

     def _insert_helper(self, node: TreeNode, key: List, value):
-        node.last_access_time = time.time()
+        node.last_access_time = time.monotonic()
         if len(key) == 0:
             return 0

@@ -395,7 +395,7 @@ class HiRadixCache(RadixCache):

         while len(key) > 0 and child_key in node.children.keys():
             node = node.children[child_key]
-            node.last_access_time = time.time()
+            node.last_access_time = time.monotonic()
             prefix_len = self.key_match_fn(node.key, key)

             if prefix_len == len(node.key):
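
The time.time() to time.monotonic() switch matters because last_access_time orders cache eviction: wall-clock time can jump backwards (NTP adjustment, manual clock changes) and scramble the recency order, while a monotonic clock is guaranteed never to go backwards. A short demonstration:

import time

t0 = time.monotonic()
time.sleep(0.01)
t1 = time.monotonic()
assert t1 >= t0  # guaranteed for monotonic(); not guaranteed for time.time()
print(f"elapsed: {t1 - t0:.4f}s")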