sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only and reflects the changes between those package versions.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -25,7 +25,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
 class ExpertLocationDispatchInfo:
     ep_dispatch_algorithm: Literal["static", "random"]
     # (num_logical_experts,)
-    partial_logical_to_rank_dispatch_physical_map: torch.Tensor
+    partial_logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
     # (num_logical_experts, X)
     partial_logical_to_all_physical_map: torch.Tensor
     # (num_logical_experts,)
@@ -42,9 +42,14 @@ class ExpertLocationDispatchInfo:
 
         return cls(
             ep_dispatch_algorithm=ep_dispatch_algorithm,
-            partial_logical_to_rank_dispatch_physical_map=
-
-
+            partial_logical_to_rank_dispatch_physical_map=(
+                expert_location_metadata.logical_to_rank_dispatch_physical_map[
+                    layer_id, :
+                ]
+                if expert_location_metadata.logical_to_rank_dispatch_physical_map
+                is not None
+                else None
+            ),
             partial_logical_to_all_physical_map=expert_location_metadata.logical_to_all_physical_map[
                 layer_id, :
             ],
@@ -55,6 +60,18 @@ class ExpertLocationDispatchInfo:
         )
 
 
+def transform_select_experts_inputs(
+    router_logits: torch.Tensor,
+    correction_bias: Optional[torch.Tensor],
+    info: Optional[ExpertLocationDispatchInfo],
+):
+    if (info is not None) and (info.ep_dispatch_algorithm == "fake"):
+        router_logits = torch.randn_like(router_logits)
+        if correction_bias is not None:
+            correction_bias = torch.zeros_like(correction_bias)
+    return router_logits, correction_bias
+
+
 def topk_ids_logical_to_physical(
     topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
 ) -> torch.Tensor:
@@ -63,9 +80,9 @@ def topk_ids_logical_to_physical(
 
     if info.ep_dispatch_algorithm == "static":
         return _topk_ids_logical_to_physical_static(topk_ids, info)
-    if info.ep_dispatch_algorithm
+    if info.ep_dispatch_algorithm in ["dynamic", "fake"]:
         return _topk_ids_logical_to_physical_dynamic(topk_ids, info)
-    raise NotImplementedError
+    raise NotImplementedError(f"Unknown algorithm {info.ep_dispatch_algorithm}")
 
 
 def _topk_ids_logical_to_physical_static(
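For context on the `fake` dispatch algorithm added above, here is a small self-contained sketch (not sglang code; tensor shapes and names are illustrative assumptions) of what `transform_select_experts_inputs` does to the router inputs: the gating logits are replaced by random noise and any correction bias is zeroed, so expert selection becomes effectively random, which is useful for stress-testing expert placement without a trained gate.

```python
# Illustrative sketch only: mimic the "fake" dispatch transform above.
from typing import Optional
import torch


def fake_dispatch(
    router_logits: torch.Tensor, correction_bias: Optional[torch.Tensor]
):
    # Replace real gating scores with noise so experts are hit roughly uniformly.
    router_logits = torch.randn_like(router_logits)
    if correction_bias is not None:
        # A zero bias keeps top-k selection driven purely by the random logits.
        correction_bias = torch.zeros_like(correction_bias)
    return router_logits, correction_bias


# Example: 4 tokens routed over 8 logical experts (shapes are assumptions).
logits = torch.zeros(4, 8)
bias = torch.ones(8)
new_logits, new_bias = fake_dispatch(logits, bias)
assert new_bias.abs().sum() == 0
```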
sglang/srt/managers/io_struct.py
CHANGED
@@ -20,7 +20,7 @@ import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List,
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from sglang.srt.mm_utils import has_valid_data
 
@@ -30,7 +30,7 @@ if TYPE_CHECKING:
 else:
     Image = Any
 
-from sglang.srt.managers.schedule_batch import BaseFinishReason
+from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
 
 
@@ -87,7 +87,7 @@ class GenerateReqInput:
 
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-    # LoRA
+    # The path to the LoRA
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
     # Session info for continual prompting
@@ -99,13 +99,16 @@ class GenerateReqInput:
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
 
     # Whether to return hidden states
-    return_hidden_states: bool = False
+    return_hidden_states: Union[List[bool], bool] = False
 
     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
-    bootstrap_port: Optional[Union[List[int], int]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
     bootstrap_room: Optional[Union[List[int], int]] = None
 
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
 
@@ -406,7 +409,11 @@ class GenerateReqInput:
                 if self.custom_logit_processor is not None
                 else None
             ),
-            return_hidden_states=
+            return_hidden_states=(
+                self.return_hidden_states[i]
+                if isinstance(self.return_hidden_states, list)
+                else self.return_hidden_states
+            ),
             # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
             bootstrap_host=(
                 self.bootstrap_host[i] if self.bootstrap_host is not None else None
@@ -417,6 +424,9 @@ class GenerateReqInput:
             bootstrap_room=(
                 self.bootstrap_room[i] if self.bootstrap_room is not None else None
            ),
+            data_parallel_rank=(
+                self.data_parallel_rank if self.data_parallel_rank is not None else None
+            ),
         )
 
 
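The `__getitem__` changes above let `return_hidden_states` be either a single flag for the whole batch or one flag per request, and thread a batch-wide `data_parallel_rank` through to each sliced request. A minimal sketch of the slicing rule, using a simplified stand-in dataclass rather than the real `GenerateReqInput`:

```python
# Simplified stand-in illustrating the per-request slicing rule used above.
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class MiniReq:
    text: List[str]
    return_hidden_states: Union[List[bool], bool] = False
    data_parallel_rank: Optional[int] = None

    def __getitem__(self, i: int) -> "MiniReq":
        return MiniReq(
            text=[self.text[i]],
            # A list means per-request flags; a bare bool applies to every request.
            return_hidden_states=(
                self.return_hidden_states[i]
                if isinstance(self.return_hidden_states, list)
                else self.return_hidden_states
            ),
            # The DP rank is batch-wide, so it is copied to every sliced request.
            data_parallel_rank=self.data_parallel_rank,
        )


batch = MiniReq(text=["a", "b"], return_hidden_states=[True, False], data_parallel_rank=1)
assert batch[0].return_hidden_states is True
assert batch[1].return_hidden_states is False
assert batch[1].data_parallel_rank == 1
```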
@@ -464,11 +474,14 @@ class TokenizedGenerateReqInput:
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
 
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
-    text: Optional[Union[List[str], str]] = None
+    text: Optional[Union[List[List[str]], List[str], str]] = None
     # The image input. It can be an image instance, file name, URL, or base64 encoded string.
     # Can be formatted as:
     # - Single image for a single request
@@ -492,6 +505,8 @@ class EmbeddingReqInput:
     log_metrics: bool = True
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    # For cross-encoder requests
+    is_cross_encoder_request: bool = False
 
     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
@@ -551,6 +566,16 @@ class EmbeddingReqInput:
         return self.rid
 
     def __getitem__(self, i):
+        if self.is_cross_encoder_request:
+            return EmbeddingReqInput(
+                text=[self.text[i]] if self.text is not None else None,
+                input_ids=None,
+                image_data=None,
+                sampling_params=self.sampling_params[i],
+                rid=self.rid[i],
+                is_cross_encoder_request=True,
+            )
+
         return EmbeddingReqInput(
             text=self.text[i] if self.text is not None else None,
             input_ids=self.input_ids[i] if self.input_ids is not None else None,
@@ -570,6 +595,8 @@ class TokenizedEmbeddingReqInput:
     input_ids: List[int]
     # The image inputs
     image_inputs: dict
+    # The token type ids
+    token_type_ids: List[int]
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams
 
@@ -834,6 +861,12 @@ class SetInternalStateReq:
     server_args: Dict[str, Any]
 
 
+@dataclass
+class V1RerankReqInput:
+    query: str
+    documents: List[str]
+
+
 @dataclass
 class SetInternalStateReqOutput:
     updated: bool
@@ -848,7 +881,8 @@ class ProfileReqInput:
     # If it is set, profiling is automatically stopped after this step, and
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
-    activities: Optional[List[
+    activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
 
@@ -875,6 +909,7 @@ class ProfileReq:
     output_dir: Optional[str] = None
     num_steps: Optional[int] = None
     activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
     profile_id: Optional[str] = None
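The new `V1RerankReqInput` above is a plain dataclass carrying one query and a list of candidate documents. A hedged construction example (the field names come from the diff; the query, documents, and serialization step are illustrative and say nothing about the transport or endpoint):

```python
# Illustrative only: building and serializing the rerank input added above.
import json
from dataclasses import asdict, dataclass
from typing import List


@dataclass
class V1RerankReqInput:
    query: str
    documents: List[str]


req = V1RerankReqInput(
    query="what is the capital of France?",
    documents=["Paris is the capital of France.", "Berlin is in Germany."],
)
# Dataclasses serialize cleanly to a JSON-able dict for an HTTP request body.
print(json.dumps(asdict(req)))
```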
sglang/srt/managers/mm_utils.py
CHANGED
@@ -252,40 +252,36 @@ def get_embedding_chunk(
     return embedding_chunk, start_index, end_index
 
 
-def
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_features, return their concatenation.
+    If some but not all have precomputed_features, raise NotImplementedError.
+    If none have precomputed_features, return None.
+    """
+    precomputed_features = [item.precomputed_features for item in items]
+    if any(feature is not None for feature in precomputed_features):
+        if not all(feature is not None for feature in precomputed_features):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_features)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
+
+
+def _get_chunked_prefill_embedding(
     data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
     embedding_items: List[MultimodalDataItem],
-    placeholder_tensor: torch.Tensor,
-    input_ids: torch.Tensor,
     items_size: List[int],
     prefix_length: List[int],
     extend_length: List[int],
     items_offset_list: List[List[Tuple[int, int]]],
-) ->
-
-    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
-
-    Args:
-        data_embedding_func: Function that generates embeddings for multimodal items
-        embedding_items: List of multimodal items to embed
-        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
-        input_ids: The input token IDs tensor
-        items_size: Cumulative sizes of multimodal items per request
-        prefix_length: Prefix lengths for each request
-        extend_length: Sequence lengths for each request
-        items_offset_list: List of offset ranges for multimodal items in each request
-
-    Returns:
-        A tuple containing:
-        - The generated embeddings tensor
-        - A boolean mask tensor indicating where these embeddings should be placed
-
-    Raises:
-        AssertionError: If the number of multimodal tokens in input_ids doesn't match
-        the number of tokens in the generated embeddings
-    """
-    # 1. Get the embedding
-    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+) -> Optional[torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
     embedding_list = []
     for i in range(len(items_size) - 1):
         if items_size[i] == items_size[i + 1]:
@@ -321,21 +317,28 @@ def get_embedding_and_mask(
         embedding_cache.free(embedding_items_hash)
         embedding_list.append(embedding_per_req_chunk)
     if len(embedding_list) == 0:
-        return None
-
-
-
-
-
-
-    ).unsqueeze(-1)
+        return None
+    return torch.concat(embedding_list, dim=0)
+
+
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
 
-
+
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
             f"Number of tokens in multimodal embedding does not match those in the input text. "
             f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
-            "tokens from multimodal embeddings."
+            f"tokens from multimodal embeddings."
         )
         if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
             chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
@@ -353,7 +356,54 @@ def get_embedding_and_mask(
             raise RuntimeError(
                 f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
             )
+    return embedding
+
+
+def get_embedding_and_mask(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
 
+    Args:
+        data_embedding_func: Function that generates embeddings for multimodal items
+        embedding_items: List of multimodal items to embed
+        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
+        input_ids: The input token IDs tensor
+        items_size: Cumulative sizes of multimodal items per request
+        prefix_length: Prefix lengths for each request
+        extend_length: Sequence lengths for each request
+        items_offset_list: List of offset ranges for multimodal items in each request
+
+    Returns:
+        A tuple containing:
+        - The generated embeddings tensor
+        - A boolean mask tensor indicating where these embeddings should be placed
+    """
+    # 1. Get embedding
+    embedding = _get_precomputed_embedding(embedding_items)
+    if embedding is None:
+        embedding = _get_chunked_prefill_embedding(
+            data_embedding_func,
+            embedding_items,
+            items_size,
+            prefix_length,
+            extend_length,
+            items_offset_list,
+        )
+        if embedding is None:
+            return None, None
+    # 2. Get mask
+    special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
+    # 3. Adjust embedding length if needed
+    embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
     return embedding, special_multimodal_mask
 
 
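The mm_utils refactor above splits `get_embedding_and_mask` into helpers; the key rule in `_get_precomputed_embedding` is all-or-nothing: either every item carries `precomputed_features` or none does. A standalone sketch of that rule (the item structure is simplified to a plain dataclass and the shapes are assumptions, not taken from the diff):

```python
# Standalone sketch of the all-or-nothing precomputed-features rule above.
from dataclasses import dataclass
from typing import List, Optional
import torch


@dataclass
class Item:
    precomputed_features: Optional[torch.Tensor] = None


def precomputed_embedding(items: List[Item]) -> Optional[torch.Tensor]:
    feats = [it.precomputed_features for it in items]
    if any(f is not None for f in feats):
        if not all(f is not None for f in feats):
            # Mixing precomputed and to-be-computed items is not supported.
            raise NotImplementedError("only some items are precomputed")
        out = torch.concat(feats)
        # Collapse a possible extra leading dimension to (tokens, hidden).
        return out.reshape(-1, out.shape[-1])
    return None


items = [Item(torch.ones(4, 8)), Item(torch.ones(2, 8))]
print(precomputed_embedding(items).shape)  # torch.Size([6, 8])
print(precomputed_embedding([Item(), Item()]))  # None
```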
@@ -5,7 +5,8 @@ import multiprocessing as mp
|
|
5
5
|
import os
|
6
6
|
import re
|
7
7
|
from abc import ABC, abstractmethod
|
8
|
-
from
|
8
|
+
from enum import Enum
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
9
10
|
|
10
11
|
import numpy as np
|
11
12
|
import torch
|
@@ -16,16 +17,24 @@ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image
 
 
+class MultimodalInputFormat(Enum):
+    """Enum for different multimodal input formats."""
+
+    RAW_IMAGES = "raw_images"
+    PRECOMPUTED_FEATURES = "precomputed_features"
+    PIXEL_VALUES = "pixel_values"
+
+
 @dataclasses.dataclass
 class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
     # frames loaded from image and video, in given order
-    images: Optional[list[Union[Image.Image,
+    images: Optional[list[Union[Image.Image, dict]]] = None
 
     # audios
-    audios: Optional[list[Union[np.ndarray,
+    audios: Optional[list[Union[np.ndarray, dict]]] = None
 
     def normalize(self):
         for field_name in ["images", "audios"]:
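With the widened `images` field, a processor output can now carry raw PIL images or dicts of pre-processed data. A hedged sketch of the three payload shapes, one per `MultimodalInputFormat` value; the dict keys follow the diff, while sizes and dtypes here are purely illustrative:

```python
import torch
from PIL import Image

raw_image = Image.new("RGB", (32, 32))                            # RAW_IMAGES
pixel_values_item = {"pixel_values": torch.rand(3, 32, 32)}       # PIXEL_VALUES
precomputed_item = {"precomputed_features": torch.rand(16, 64)}   # PRECOMPUTED_FEATURES

# A single request is expected to stick to one format; mixing formats is rejected
# later by categorize_mm_inputs in the process_and_combine_mm_data hunk below.
images = [pixel_values_item, {"pixel_values": torch.rand(3, 32, 32)}]
```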
@@ -137,7 +146,7 @@ class BaseMultimodalProcessor(ABC):
         request_obj,
         max_req_input_len,
         **kwargs,
-    ):
+    ) -> Optional[Dict[str, Any]]:
         pass
 
     def get_estimated_frames_list(self, image_data):
@@ -170,8 +179,6 @@ class BaseMultimodalProcessor(ABC):
     ):
         """Static method that can be pickled for multiprocessing"""
         if isinstance(data, dict):
-            return MultimodalDataItem.from_dict(data)
-        if isinstance(data, MultimodalDataItem):
             return data
         try:
             if is_audio:
@@ -254,7 +261,7 @@ class BaseMultimodalProcessor(ABC):
 
     def load_mm_data(
         self,
-        prompt: str,
+        prompt: str | List[int],
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
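`load_mm_data` now also accepts a pre-tokenized prompt. A hedged illustration (not sglang code) of the dispatch such a widened `prompt: str | List[int]` signature implies: token-id prompts pass through untouched and only strings are tokenized. The helper name and tokenizer are hypothetical.

```python
from typing import Callable, List, Union

def ensure_token_ids(
    prompt: Union[str, List[int]], tokenize: Callable[[str], List[int]]
) -> List[int]:
    # Strings are tokenized; lists of ids are returned as-is.
    if isinstance(prompt, str):
        return tokenize(prompt)
    return list(prompt)

assert ensure_token_ids([101, 7, 102], tokenize=lambda s: []) == [101, 7, 102]
assert ensure_token_ids("hi", tokenize=lambda s: [104, 105]) == [104, 105]
```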
@@ -370,15 +377,180 @@ class BaseMultimodalProcessor(ABC):
 
         return list(zip(indices_start.tolist(), indices_end.tolist()))
 
-
-
-
-
-
-
-
-        )
-
-
+    @staticmethod
+    def _extract_processor_features(
+        items: List[dict], attr_name: str
+    ) -> Optional[torch.Tensor]:
+        """
+        Helper function to concat extracted attributes from processor output.
+        """
+        values = [value for item in items if (value := item.get(attr_name)) is not None]
+        return torch.cat(values) if values else None
+
+    # When we assume that all the items have the same attributes
+    def _extract_processor_features_from_all_attributes(
+        self, items: List[dict]
+    ) -> dict:
+        values = {}
+        # Verify all items have the same keys
+        first_keys = set(items[0].keys())
+        for item in items[1:]:
+            if set(item.keys()) != first_keys:
+                raise ValueError(
+                    f"All items must have the same attributes. "
+                    f"First item has {first_keys}, but found {set(item.keys())}"
+                )
+
+        # Process each attribute
+        for k, v in items[0].items():
+            if isinstance(v, list):
+                values[k] = self._extract_processor_features(items, k)
+            else:
+                # Verify all items have the same value for non-list attributes
+                for item in items[1:]:
+                    if item[k] != v:
+                        raise ValueError(
+                            f"All items must have the same value for attribute {k}. "
+                            f"First item has {v}, but found {item[k]}"
+                        )
+                values[k] = v
+        return values
+
+    def process_and_combine_mm_data(
+        self, base_output: BaseMultiModalProcessorOutput
+    ) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
+        """
+        Process multimodal data and return the combined multimodal item and input_ids.
+        Handles all three input formats at the same abstraction level.
+
+        Returns:
+            Tuple of (combined_mm_item, input_ids)
+        """
+
+        def tokenize_text(input_text: str) -> torch.Tensor:
+            """Tokenize input text."""
+            return self._processor.tokenizer(
+                input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+
+        def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
+            """Categorize multimodal inputs and validate consistency."""
+            try:
+                has_image = False
+                has_pixel_values = False
+                has_precomputed_features = False
+
+                for mm_input in mm_inputs:
+                    if isinstance(mm_input, Image.Image):
+                        has_image = True
+                    elif isinstance(mm_input, dict):
+                        if mm_input.get("precomputed_features", None) is not None:
+                            has_precomputed_features = True
+                        elif mm_input.get("pixel_values", None) is not None:
+                            has_pixel_values = True
+                        else:
+                            raise ValueError(
+                                f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
+                            )
+                    else:
+                        raise ValueError(
+                            f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
+                        )
+
+                # Validate format consistency
+                format_count = sum(
+                    [has_image, has_pixel_values, has_precomputed_features]
+                )
+                if format_count > 1:
+                    raise ValueError(
+                        "Unsupported: mixture of multimodal input formats. "
+                        f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
+                        f"precomputed_features={has_precomputed_features}"
+                    )
+
+                if has_image:
+                    return MultimodalInputFormat.RAW_IMAGES
+                elif has_precomputed_features:
+                    return MultimodalInputFormat.PRECOMPUTED_FEATURES
+                elif has_pixel_values:
+                    return MultimodalInputFormat.PIXEL_VALUES
+                else:
+                    raise ValueError("No valid multimodal input format found")
+            except Exception as e:
+                raise ValueError(f"Failed to categorize inputs: {e}")
+
+        def process_raw_images(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process raw Image.Image objects using transformers processor."""
+            ret = self.process_mm_data(
+                input_text=base_output.input_text,
+                images=base_output.images,
+            )
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+
+            # Copy all fields from processor output except input_ids
+            for key, value in ret.items():
+                if key != "input_ids" and hasattr(combined_mm_item, key):
+                    setattr(combined_mm_item, key, value)
+
+            input_ids = ret["input_ids"].flatten()
+            return combined_mm_item, input_ids
+
+        def process_precomputed_features(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with precomputed features."""
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+            combined_mm_item.precomputed_features = self._extract_processor_features(
+                base_output.images, "precomputed_features"
             )
-
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def process_pixel_values(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with pixel values."""
+            values = self._extract_processor_features_from_all_attributes(
+                base_output.images
+            )
+            combined_mm_item = MultimodalDataItem.from_dict(values)
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def finalize_mm_item(
+            combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
+        ) -> MultimodalDataItem:
+            """Apply common post-processing to the multimodal item."""
+            combined_mm_item.image_offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=self.IM_TOKEN_ID,
+            )
+            return combined_mm_item
+
+        # Main logic
+        mm_inputs = base_output.images
+        if not mm_inputs:
+            # Return text-only case
+            input_ids = tokenize_text(base_output.input_text)
+            return None, input_ids
+
+        # Categorize input formats
+        input_format = categorize_mm_inputs(mm_inputs)
+
+        # Process based on format
+        if input_format == MultimodalInputFormat.RAW_IMAGES:
+            combined_mm_item, input_ids = process_raw_images(base_output)
+        elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
+            combined_mm_item, input_ids = process_precomputed_features(base_output)
+        elif input_format == MultimodalInputFormat.PIXEL_VALUES:
+            combined_mm_item, input_ids = process_pixel_values(base_output)
+        else:
+            raise ValueError(f"Unknown input format: {input_format}")
+
+        # Finalize with common processing
+        combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
+        return combined_mm_item, input_ids
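The new `_extract_processor_features` helper underpins both the precomputed-features and pixel-values paths above: it collects one attribute from each item dict, skips missing entries, and concatenates along dim 0. A standalone sketch of the same pattern, with hypothetical data, showing the skip and the None fallback:

```python
from typing import List, Optional
import torch

def concat_attr(items: List[dict], attr_name: str) -> Optional[torch.Tensor]:
    # Gather the attribute from each dict, ignoring items that lack it.
    values = [v for item in items if (v := item.get(attr_name)) is not None]
    # Concatenate along dim 0, or return None when nothing was found.
    return torch.cat(values) if values else None

items = [
    {"precomputed_features": torch.zeros(2, 4)},
    {"other": 1},                          # items without the attribute are skipped
    {"precomputed_features": torch.ones(3, 4)},
]
out = concat_attr(items, "precomputed_features")
assert out is not None and out.shape == (5, 4)
assert concat_attr(items, "missing") is None
```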
@@ -27,6 +27,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
 
     async def process_mm_data_async(
         self,
@@ -42,49 +43,21 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
-        image_token_regex = self.IMAGE_TOKEN_REGEX
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
-                image_token=
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
             ),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )
 
-
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=None if images_are_preprocessed else base_output.images,
-        )
-
-        items = []
-        input_ids = ret["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
-            mm_token_id=self.hf_config.image_token_index,
-        )
-        for i, image in enumerate(base_output.images):
-            if images_are_preprocessed:
-                pixel_values = image.pixel_values
-                precomputed_features = image.precomputed_features
-            else:
-                pixel_values = ret["pixel_values"][i]
-                precomputed_features = None
-
-            item = MultimodalDataItem(
-                pixel_values=pixel_values,
-                precomputed_features=precomputed_features,
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets[i],
-            )
-            items += [item]
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
-            "mm_items": items,
             "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
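After the refactor, the Gemma 3 processor returns at most one combined multimodal item instead of a per-image list. A hedged sketch of the shape of that payload; the helper name and the literal token ids below are placeholders, not Gemma 3's real values:

```python
# Hypothetical builder mirroring the dict returned at the end of the hunk above.
def build_processor_output(combined_mm_item, input_ids, im_start_id, im_end_id):
    return {
        "input_ids": list(input_ids),
        # Text-only prompts yield no multimodal items.
        "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
        "im_start_id": im_start_id,
        "im_end_id": im_end_id,
    }

text_only = build_processor_output(None, [1, 2, 3], im_start_id=7, im_end_id=8)
assert text_only["mm_items"] == []
```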