sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -85,6 +85,22 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
     def commit_lazy_operations(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
@@ -374,7 +390,8 @@ class Runtime:
         self.pid = None
         pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
 
-        proc = multiprocessing.Process(
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
            target=launch_server,
            args=(self.server_args, pipe_writer),
        )
@@ -406,6 +423,12 @@ class Runtime:
         kill_process_tree(self.pid)
         self.pid = None
 
+    def start_profile(self):
+        self.endpoint.start_profile()
+
+    def stop_profile(self):
+        self.endpoint.stop_profile()
+
     def cache_prefix(self, prefix: str):
         self.endpoint.cache_prefix(prefix)
 
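Example (not part of the diff): a minimal sketch of driving the new profiling hooks from the sglang frontend. The model path is a placeholder and the surrounding generation calls are assumed; only start_profile/stop_profile come from the hunks above.

# Minimal sketch, assuming a locally available model and the existing sglang frontend API.
import sglang as sgl

runtime = sgl.Runtime(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
sgl.set_default_backend(runtime)

runtime.start_profile()   # delegates to RuntimeEndpoint.start_profile() -> /start_profile
# ... run some generations here so the server has forward passes to record ...
runtime.stop_profile()    # delegates to RuntimeEndpoint.stop_profile() -> /stop_profile

runtime.shutdown()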
sglang/profiler.py
ADDED
@@ -0,0 +1,167 @@
+"""
+Run live profiling.
+
+Usage:
+python3 -m sglang.profiler
+"""
+
+import argparse
+import json
+import os
+import time
+import urllib.parse
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import List, Optional
+
+import requests
+
+PARENT_FOLDER = "/tmp/sglang-profile"
+
+
+def _run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+) -> str:
+    if output_dir is None:
+        output_dir = PARENT_FOLDER
+
+    output_dir = os.path.normpath(output_dir)
+    output_dir = os.path.abspath(output_dir)
+    output_dir = Path(output_dir)
+
+    # Add "profile_name/timestamp" to the path.
+    if profile_name:
+        output_dir = output_dir / profile_name
+    output_dir = output_dir / str(time.time())
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    print(f"Dump profiling traces to {output_dir}")
+    print(
+        f"Waiting for {num_steps} steps and the trace to be flushed.... ({profile_by_stage=})"
+    )
+
+    # Dump server args.
+    file_path = Path(output_dir) / "server_args.json"
+    if not file_path.exists():
+        response = requests.get(url + "/get_server_info")
+        response.raise_for_status()
+        server_args_data = response.json()
+        with open(file_path, "w") as file:
+            file.write(json.dumps(server_args_data))
+
+    # Start profiler. The API replies when all steps are processed
+    # and files are generated.
+    json_data = {
+        "output_dir": str(output_dir),
+        "num_steps": str(num_steps),
+        "activities": activities,
+        "profile_by_stage": profile_by_stage,
+    }
+
+    response = requests.post(url=url + "/start_profile", json=json_data)
+    response.raise_for_status()
+
+    trace_link = str(output_dir)
+    return trace_link
+
+
+def run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+):
+    # step based profile will self terminate on num_steps constraints
+    link = _run_profile(
+        url, num_steps, activities, output_dir, profile_name, profile_by_stage
+    )
+    return link
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:30000",
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Profile directory to dump profile traces.",
+    )
+    parser.add_argument(
+        "--profile-name",
+        type=str,
+        default=None,
+        help="The name of this profile run.",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        default=5,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--profile-by-stage",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--cpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile CPU activity",
+    )
+    parser.add_argument(
+        "--gpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile GPU activity",
+    )
+    parser.add_argument(
+        "--mem",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to memory usage (https://pytorch.org/memory_viz)",
+    )
+    parser.add_argument(
+        "--rpd",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
+    )
+
+    args = parser.parse_args()
+    activities = []
+    if args.cpu:
+        activities.append("CPU")
+    if args.gpu:
+        activities.append("GPU")
+    if args.mem:
+        activities.append("MEM")
+    if args.rpd:
+        activities.append("RPD")
+    run_profile(
+        args.url,
+        args.num_steps,
+        activities,
+        args.output_dir,
+        args.profile_name,
+        args.profile_by_stage,
+    )
sglang/srt/_custom_ops.py
CHANGED
@@ -113,3 +113,37 @@ else:
 
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
         return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+
+
+def mscclpp_generate_unique_id() -> bytes:
+    return sgl_kernel.allreduce.mscclpp_generate_unique_id()
+
+
+def mscclpp_init_context(
+    unique_id: bytes,
+    rank: int,
+    world_size: int,
+    scratch: torch.Tensor,
+    put_buffer: torch.Tensor,
+    nranks_per_node: int,
+    rank_to_node: List[int],
+    rank_to_ib: List[int],
+    context_selection: int,
+) -> int:
+    return sgl_kernel.allreduce.mscclpp_init_context(
+        unique_id,
+        rank,
+        world_size,
+        scratch,
+        put_buffer,
+        nranks_per_node,
+        rank_to_node,
+        rank_to_ib,
+        context_selection,
+    )
+
+
+def mscclpp_allreduce(
+    context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+) -> None:
+    return sgl_kernel.allreduce.mscclpp_allreduce(context, inp, out, nthreads, nblocks)
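Example (not part of the diff): a hedged sketch of the call order these wrappers suggest. Only the signatures above come from the release; buffer sizes, the rank/node maps, the context_selection value, and how unique_id is shared between ranks are illustrative assumptions.

# Hedged sketch; a CUDA device and an initialized torch.distributed process group are assumed.
import torch
import torch.distributed as dist
from sglang.srt import _custom_ops as ops

rank, world_size = dist.get_rank(), dist.get_world_size()

obj = [ops.mscclpp_generate_unique_id() if rank == 0 else None]
dist.broadcast_object_list(obj, src=0)  # one possible way to share the id; an assumption
unique_id = obj[0]

scratch = torch.zeros(8 << 20, dtype=torch.uint8, device="cuda")     # assumed size
put_buffer = torch.zeros(1 << 20, dtype=torch.uint8, device="cuda")  # assumed size
context = ops.mscclpp_init_context(
    unique_id, rank, world_size, scratch, put_buffer,
    nranks_per_node=8,
    rank_to_node=[r // 8 for r in range(world_size)],
    rank_to_ib=[r % 8 for r in range(world_size)],
    context_selection=0,  # semantics of this flag are not shown in the diff
)

x = torch.randn(4096, dtype=torch.bfloat16, device="cuda")
out = torch.empty_like(x)
ops.mscclpp_allreduce(context, x, out, nthreads=512, nblocks=32)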
sglang/srt/configs/internvl.py
CHANGED
@@ -7,11 +7,8 @@ import sentencepiece as spm
 from transformers import (
     TOKENIZER_MAPPING,
     LlamaConfig,
-    Phi3Config,
     PretrainedConfig,
     PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2Config,
 )
 
 from sglang.utils import logger
@@ -302,24 +299,23 @@ class InternVLChatConfig(PretrainedConfig):
         )
 
         if llm_config is None:
-
-            llm_config = {"architectures": [""]}
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
             logger.info(
                 "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
             )
+
         self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config["architectures"][0] == "LlamaForCausalLM":
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
             self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
-        elif llm_config["architectures"][0] == "Phi3ForCausalLM":
-            self.llm_config = Phi3Config(**llm_config)
-        elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
-            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
-                "Unsupported architecture: {}".format(llm_config["architectures"][0])
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
             )
+
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.pad2square = pad2square
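Example (not part of the diff): the dispatch above keys on the first entry of llm_config["architectures"]; an illustrative construction with placeholder field values.

# Illustrative only; InternVisionConfig fills in defaults for an empty vision_config.
from sglang.srt.configs.internvl import InternVLChatConfig

cfg = InternVLChatConfig(
    vision_config={},
    llm_config={"architectures": ["InternLM2ForCausalLM"]},  # or "LlamaForCausalLM"
)
# Any other architectures entry now raises ValueError("Unsupported architecture: ...").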
@@ -16,13 +16,17 @@ import json
|
|
16
16
|
import logging
|
17
17
|
import math
|
18
18
|
import os
|
19
|
-
from enum import IntEnum, auto
|
19
|
+
from enum import Enum, IntEnum, auto
|
20
20
|
from typing import List, Optional, Set, Union
|
21
21
|
|
22
22
|
import torch
|
23
23
|
from transformers import PretrainedConfig
|
24
24
|
|
25
|
-
from sglang.srt.hf_transformers_utils import
|
25
|
+
from sglang.srt.hf_transformers_utils import (
|
26
|
+
get_config,
|
27
|
+
get_context_length,
|
28
|
+
get_hf_text_config,
|
29
|
+
)
|
26
30
|
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
27
31
|
from sglang.srt.server_args import ServerArgs
|
28
32
|
from sglang.srt.utils import get_bool_env_var, is_hip
|
@@ -35,6 +39,12 @@ class AttentionArch(IntEnum):
|
|
35
39
|
MHA = auto()
|
36
40
|
|
37
41
|
|
42
|
+
class ModelImpl(str, Enum):
|
43
|
+
AUTO = "auto"
|
44
|
+
SGLANG = "sglang"
|
45
|
+
TRANSFORMERS = "transformers"
|
46
|
+
|
47
|
+
|
38
48
|
class ModelConfig:
|
39
49
|
def __init__(
|
40
50
|
self,
|
@@ -49,11 +59,13 @@ class ModelConfig:
|
|
49
59
|
quantization: Optional[str] = None,
|
50
60
|
override_config_file: Optional[str] = None,
|
51
61
|
is_draft_model: bool = False,
|
62
|
+
impl: Union[str, ModelImpl] = ModelImpl.AUTO,
|
52
63
|
) -> None:
|
53
64
|
|
54
65
|
self.model_path = model_path
|
55
66
|
self.revision = revision
|
56
67
|
self.quantization = quantization
|
68
|
+
self.impl = impl
|
57
69
|
|
58
70
|
# Parse args
|
59
71
|
self.maybe_pull_model_tokenizer_from_remote()
|
@@ -69,6 +81,7 @@ class ModelConfig:
|
|
69
81
|
model_override_args=self.model_override_args,
|
70
82
|
**kwargs,
|
71
83
|
)
|
84
|
+
|
72
85
|
self.hf_text_config = get_hf_text_config(self.hf_config)
|
73
86
|
self.attention_chunk_size = getattr(
|
74
87
|
self.hf_text_config, "attention_chunk_size", None
|
@@ -93,6 +106,8 @@ class ModelConfig:
|
|
93
106
|
):
|
94
107
|
self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
|
95
108
|
|
109
|
+
if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
|
110
|
+
self.hf_config.architectures[0] = "MiMoMTP"
|
96
111
|
# Check model type
|
97
112
|
self.is_generation = is_generation_model(
|
98
113
|
self.hf_config.architectures, is_embedding
|
@@ -109,6 +124,10 @@ class ModelConfig:
|
|
109
124
|
self.is_audio_model = enable_multimodal and is_audio_model(
|
110
125
|
self.hf_config.architectures
|
111
126
|
)
|
127
|
+
self.is_multimodal_chunked_prefill_supported = (
|
128
|
+
enable_multimodal
|
129
|
+
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
130
|
+
)
|
112
131
|
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
113
132
|
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
114
133
|
|
@@ -185,6 +204,22 @@ class ModelConfig:
|
|
185
204
|
self.v_head_dim = self.hf_text_config.v_head_dim
|
186
205
|
self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
|
187
206
|
else:
|
207
|
+
if (
|
208
|
+
"MistralModel" in self.hf_config.architectures
|
209
|
+
or "MixtralForCausalLM" in self.hf_config.architectures
|
210
|
+
or "MistralForCausalLM" in self.hf_config.architectures
|
211
|
+
):
|
212
|
+
if getattr(self, "head_dim", None) is None:
|
213
|
+
self.head_dim = (
|
214
|
+
self.hf_config.hidden_size // self.hf_config.num_attention_heads
|
215
|
+
)
|
216
|
+
# In transformers==4.52.3, the head_dim is null in MistralConfig
|
217
|
+
if (
|
218
|
+
not hasattr(self.hf_text_config, "head_dim")
|
219
|
+
or self.hf_text_config.head_dim is None
|
220
|
+
):
|
221
|
+
setattr(self.hf_text_config, "head_dim", self.head_dim)
|
222
|
+
|
188
223
|
self.attention_arch = AttentionArch.MHA
|
189
224
|
|
190
225
|
self.num_attention_heads = self.hf_text_config.num_attention_heads
|
@@ -209,7 +244,13 @@ class ModelConfig:
|
|
209
244
|
|
210
245
|
# Cache attributes
|
211
246
|
self.hf_eos_token_id = self.get_hf_eos_token_id()
|
212
|
-
|
247
|
+
|
248
|
+
config = self.hf_config
|
249
|
+
|
250
|
+
# multimodal
|
251
|
+
self.image_token_id = getattr(config, "image_token_id", None) or getattr(
|
252
|
+
config, "image_token_index", None
|
253
|
+
)
|
213
254
|
|
214
255
|
@staticmethod
|
215
256
|
def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
|
@@ -223,6 +264,7 @@ class ModelConfig:
|
|
223
264
|
enable_multimodal=server_args.enable_multimodal,
|
224
265
|
dtype=server_args.dtype,
|
225
266
|
quantization=server_args.quantization,
|
267
|
+
impl=server_args.impl,
|
226
268
|
**kwargs,
|
227
269
|
)
|
228
270
|
|
@@ -332,6 +374,7 @@ class ModelConfig:
|
|
332
374
|
"w8a8_int8",
|
333
375
|
"w8a8_fp8",
|
334
376
|
"moe_wna16",
|
377
|
+
"qoq",
|
335
378
|
]
|
336
379
|
compatible_quantization_methods = {
|
337
380
|
"modelopt_fp4": ["modelopt"],
|
@@ -423,31 +466,6 @@ class ModelConfig:
|
|
423
466
|
self.model_path = client.get_local_dir()
|
424
467
|
|
425
468
|
|
426
|
-
def get_hf_text_config(config: PretrainedConfig):
|
427
|
-
"""Get the "sub" config relevant to llm for multi modal models.
|
428
|
-
No op for pure text models.
|
429
|
-
"""
|
430
|
-
class_name = config.architectures[0]
|
431
|
-
if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
|
432
|
-
# We support non-hf version of llava models, so we do not want to
|
433
|
-
# read the wrong values from the unused default text_config.
|
434
|
-
# NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
|
435
|
-
# `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
|
436
|
-
setattr(config, "torch_dtype", torch.float16)
|
437
|
-
return config
|
438
|
-
|
439
|
-
if hasattr(config, "text_config"):
|
440
|
-
# The code operates under the assumption that text_config should have
|
441
|
-
# `num_attention_heads` (among others). Assert here to fail early
|
442
|
-
# if transformers config doesn't align with this assumption.
|
443
|
-
assert hasattr(config.text_config, "num_attention_heads")
|
444
|
-
return config.text_config
|
445
|
-
if hasattr(config, "language_config"):
|
446
|
-
return config.language_config
|
447
|
-
else:
|
448
|
-
return config
|
449
|
-
|
450
|
-
|
451
469
|
# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
|
452
470
|
_STR_DTYPE_TO_TORCH_DTYPE = {
|
453
471
|
"half": torch.float16,
|
@@ -466,6 +484,8 @@ def _get_and_verify_dtype(
|
|
466
484
|
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
467
485
|
# because config.torch_dtype can be None.
|
468
486
|
config_dtype = getattr(config, "torch_dtype", None)
|
487
|
+
if isinstance(config_dtype, str):
|
488
|
+
config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
|
469
489
|
if config_dtype is None:
|
470
490
|
config_dtype = torch.float32
|
471
491
|
|
@@ -537,6 +557,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
 
 
 multimodal_model_archs = [
+    "CLIPModel",
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
@@ -549,13 +570,14 @@ multimodal_model_archs = [
     "LlavaVidForCausalLM",
     "MiniCPMO",
     "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
-    "CLIPModel",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
+    "Phi4MMForCausalLM",
 ]


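`multimodal_model_archs` is presumably consumed as a simple membership test against `config.architectures`; a quick illustration (the helper name is hypothetical, the list is the one defined above):

    def uses_multimodal_path(model_architectures):
        # Hypothetical helper: any listed architecture enables the multimodal code path.
        return any(arch in multimodal_model_archs for arch in model_architectures)

    assert uses_multimodal_path(["Phi4MMForCausalLM"]) is True
    assert uses_multimodal_path(["LlamaForCausalLM"]) is False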
@@ -585,6 +607,21 @@ def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures


+def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+    """Check if chunked prefill is supported for a MultiModal model."""
+    unsupported = [
+        "Grok1VForCausalLM",
+        "Grok1AForCausalLM",
+        "LlavaLlamaForCausalLM",
+        "MllamaForConditionalGeneration",
+        "CLIPModel",
+    ]
+    if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+        return False
+    else:
+        return True
+
+
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
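The new `is_multimodal_chunked_prefill_supported` helper is a plain architecture-list check, so it can be exercised directly; how the scheduler consumes the result is outside this diff. A small usage sketch using architecture names from the list above:

    # Architectures come straight from the HF config's `architectures` field.
    assert is_multimodal_chunked_prefill_supported(["Qwen2VLForConditionalGeneration"]) is True
    assert is_multimodal_chunked_prefill_supported(["MllamaForConditionalGeneration"]) is False
    # A mixed list is rejected as soon as any unsupported architecture appears.
    assert is_multimodal_chunked_prefill_supported(
        ["Qwen2VLForConditionalGeneration", "CLIPModel"]
    ) is False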
@@ -60,7 +60,7 @@ class BaseGrammarObject:
         raise NotImplementedError()

     def copy(self) -> "BaseGrammarObject":
-
+        return self

     @property
     def finished(self):
@@ -99,9 +99,12 @@ class BaseGrammarObject:
         raise NotImplementedError()


+INVALID_GRAMMAR_OBJ = BaseGrammarObject()
+
+
 @dataclass
 class CacheEntry:
-    value:
+    value: BaseGrammarObject
     event: Event


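`INVALID_GRAMMAR_OBJ` gives every grammar backend a shared sentinel to return when compilation fails, instead of returning `None`, so callers can distinguish "compilation failed" from "no grammar produced yet" with an identity check. A minimal sketch of the consumer side; `use_grammar` and its rejection behaviour are illustrative, not sglang code:

    from sglang.srt.constrained.base_grammar_backend import (
        INVALID_GRAMMAR_OBJ,
        BaseGrammarObject,
    )

    def use_grammar(grammar: BaseGrammarObject) -> bool:
        # Identity check against the module-level sentinel rather than `grammar is None`.
        if grammar is INVALID_GRAMMAR_OBJ:
            # Hypothetical handling: reject the request that asked for this grammar.
            return False
        return True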
@@ -28,6 +28,7 @@ from llguidance.torch import (
 )

 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -126,8 +127,8 @@ class GuidanceBackend(BaseGrammarBackend):
                 serialized_grammar=serialized_grammar,
             )
         except Exception as e:
-            logger.
-            return
+            logger.error(f"Hit invalid grammar: {serialized_grammar=}, {e=}")
+            return INVALID_GRAMMAR_OBJ

     def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
         try:
@@ -138,8 +139,8 @@ class GuidanceBackend(BaseGrammarBackend):
                 },
             )
         except Exception as e:
-            logger.
-            return
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_serialized(serialized_grammar)

     def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
@@ -151,8 +152,8 @@ class GuidanceBackend(BaseGrammarBackend):
             serialized_grammar = grammar_from("ebnf", key_string)
             return self._from_serialized(serialized_grammar)
         except ValueError as e:
-            logger.
-            return
+            logger.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ

     def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
         try:
@@ -169,5 +170,5 @@ class GuidanceBackend(BaseGrammarBackend):
             g = StructTag.to_grammar(tags)
             return self._from_serialized(g)
         except Exception as e:
-            logging.
-            return
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
@@ -24,6 +24,7 @@ from outlines.models.transformers import TransformerTokenizer
 from pydantic import BaseModel

 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -151,8 +152,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                 # outlines <= 0.0.46
                 guide = RegexGuide(regex, self.outlines_tokenizer)
         except interegular.patterns.InvalidSyntax as e:
-            logger.
-            return
+            logger.error(f"Hit invalid regex schema: {regex=}, {e=}")
+            return INVALID_GRAMMAR_OBJ

         jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
@@ -170,8 +171,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                 whitespace_pattern=self.whitespace_pattern,
             )
         except (NotImplementedError, json.decoder.JSONDecodeError, ValueError) as e:
-            logger.
-            return
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._compile_regex(regex)

     def dispatch_regex(self, key_string: str):
@@ -28,6 +28,7 @@ from xgrammar import (
 )

 from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
     BaseGrammarBackend,
     BaseGrammarObject,
 )
@@ -152,10 +153,11 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()

-
-
-
-
+        if True:
+            tokenizer_info = TokenizerInfo.from_huggingface(
+                tokenizer, vocab_size=vocab_size
+            )
+            override_stop_tokens = None

         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
@@ -178,25 +180,26 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                 ctx = self.grammar_compiler.compile_builtin_json_grammar()
             else:
                 ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
-
-
-
+
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def dispatch_ebnf(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             ctx = self.grammar_compiler.compile_grammar(key_string)
         except RuntimeError as e:
-            logging.
-            return
+            logging.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def dispatch_regex(self, key_string: str) -> Optional[XGrammarGrammar]:
         try:
             ctx = self.grammar_compiler.compile_regex(key_string)
         except RuntimeError as e:
-            logging.
-            return
+            logging.error(f"Hit invalid regex: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def dispatch_structural_tag(self, key_string: str) -> Optional[XGrammarGrammar]:
@@ -213,13 +216,10 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
             ctx = self.grammar_compiler.compile_structural_tag(
                 tags, structural_tag["triggers"]
             )
-        except RuntimeError as e:
-            logging.
-
-            )
-            return None
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
         return self._from_context(ctx, key_string)

     def reset(self):
-
-        self.grammar_compiler.clear_cache()
+        self.grammar_compiler.clear_cache()
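Taken together, the xgrammar-side changes make construction and the JSON-schema path look roughly like this. This is a condensed sketch assembled from the hunks above, not the full backend class; `tokenizer` and `vocab_size` are assumed to be supplied by the caller, and failures follow the other backends by logging and returning `INVALID_GRAMMAR_OBJ`:

    import json
    import logging

    from xgrammar import GrammarCompiler, TokenizerInfo

    from sglang.srt.constrained.base_grammar_backend import INVALID_GRAMMAR_OBJ

    def build_compiler(tokenizer, vocab_size: int) -> GrammarCompiler:
        # Mirrors the constructor hunk: derive TokenizerInfo from the HF tokenizer.
        tokenizer_info = TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size)
        return GrammarCompiler(tokenizer_info=tokenizer_info)

    def compile_json(compiler: GrammarCompiler, key_string: str):
        # Mirrors dispatch_json: invalid schemas log an error and yield the sentinel.
        try:
            return compiler.compile_json_schema(schema=key_string)
        except (RuntimeError, json.decoder.JSONDecodeError) as e:
            logging.error(f"Hit invalid json_schema: {key_string=}, {e=}")
            return INVALID_GRAMMAR_OBJ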