sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/mm_utils.py
CHANGED
@@ -2,6 +2,7 @@
 Multi-modality utils
 """
 
+import dataclasses
 import logging
 from abc import abstractmethod
 from typing import Callable, List, Optional, Tuple
@@ -15,10 +16,15 @@ from sglang.srt.managers.schedule_batch import (
     MultimodalInputs,
     global_server_args_dict,
 )
+from sglang.srt.mem_cache.multimodal_cache import MultiModalCache
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import flatten_nested_list, print_warning_once
+from sglang.utils import logger
 
-logger
+# NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
+# to ensure consistent logging behavior across the codebase. This prevents issues with log
+# propagation that can cause some log messages (like 'server is fired up') to not appear
+# in the console when multimodal support is enabled.
 
 
 class MultiModalityDataPaddingPattern:
@@ -41,11 +47,26 @@ class MultiModalityDataPaddingPattern:
 class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern):
     """In this pattern, data tokens should be enclosed by special token pairs (e.g. <image>...</image>, data_token_pairs)
 
+    The padded value in a region enclosed by a token pair with be the same one, as the MultimodalDataItem's pad value
+
     This strategy should be applied when data content is marked by start/end token pairs in the input sequence.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        data_token_pairs: Optional[List[Tuple[int, int]]],
+        data_start_token_ids: Optional[List[int]] = None,
+    ) -> None:
+        """
+
+        Args:
+            data_start_token_ids marks the start of a single multimodal data
+            See Minicpmo's slice_start_id for example
+        """
         self.data_token_id_pairs = data_token_pairs
+        self.data_start_token_ids = data_start_token_ids or [
+            s for s, _e in data_token_pairs
+        ]
 
     def pad_input_tokens(
         self, input_ids: List[int], mm_inputs: MultimodalInputs
@@ -79,7 +100,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         for start_idx, end_idx in zip(start_indices, end_indices):
             padded_ids.extend(input_ids[last_idx : start_idx + 1])
 
-            if input_ids[start_idx] in
+            if input_ids[start_idx] in self.data_start_token_ids:
                 data_idx += 1
                 mm_inputs.data_offsets += [start_idx]
 
@@ -170,46 +191,156 @@ class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPa
                 output_ids_tensor[start_idx:end_idx] = pad_value
             else:
                 logger.warning(f"Skipping region {i} due to None pad_value.")
-
         return output_ids_tensor.tolist()
 
 
-
-
-
-
-
-)
+embedding_cache = None
+
+
+def init_embedding_cache(max_size: int):
+    global embedding_cache
+    embedding_cache = MultiModalCache(max_size)
+
+
+def get_embedding_hash(embedding_items: List[MultimodalDataItem]) -> int:
+    hash_list = [item.hash for item in embedding_items]
+    return hash(tuple(hash_list))
+
+
+def get_embedding_chunk(
+    embedding: torch.Tensor,
+    extend_prefix_len: int,
+    extend_seq_len: int,
+    items_offset: List[Tuple[int, int]],
+) -> Tuple[torch.Tensor, int, int]:
     """
-
+    Extract a chunk of embeddings based on the specified prefix length, sequence length, and offset ranges.
 
+    Args:
+        embedding: The full embedding tensor to extract a chunk from
+        extend_prefix_len: The starting position (prefix length) for extraction
+        extend_seq_len: The number of tokens to extract
+        items_offset: List of [start, end] offset ranges for multimodal items in the input sequence
+
+    Returns:
+        A tuple containing:
+        - The extracted embedding chunk as a tensor
+        - The start index used for extraction
+        - The end index used for extraction
+
+    Note:
+        If there's no overlap between the requested range and the offset ranges,
+        an empty tensor is returned with zeros for start and end indices.
+    """
+    start_index, end_index = 0, 0
+    extend_start_index = extend_prefix_len
+    extend_end_index = extend_prefix_len + extend_seq_len - 1
+
+    for start, end in items_offset:
+        if extend_start_index >= start and extend_start_index <= end:
+            start_index += extend_start_index - start
+        elif extend_start_index > end:
+            start_index += end - start + 1
+
+        if extend_end_index >= start and extend_end_index <= end:
+            end_index += extend_end_index - start + 1
+        elif extend_end_index > end:
+            end_index += end - start + 1
+    # some models embedding is 3-dim, reshape it to 2-dim
+    embedding = embedding.reshape(-1, embedding.shape[-1])
+    embedding_chunk = embedding[start_index:end_index]
+    return embedding_chunk, start_index, end_index
+
+
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_features, return their concatenation.
+    If some but not all have precomputed_features, raise NotImplementedError.
+    If none have precomputed_features, return None.
     """
-
-
+    precomputed_features = [item.precomputed_features for item in items]
+    if any(feature is not None for feature in precomputed_features):
+        if not all(feature is not None for feature in precomputed_features):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_features)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
 
-    # 2. Check the embedding
-    if embedding.dim() == 2:
-        num_mm_tokens_in_embedding = embedding.shape[0]
-    else:
-        num_mm_tokens_in_embedding = embedding.shape[0] * embedding.shape[1]
 
-
-
-
-
-
+def _get_chunked_prefill_embedding(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Optional[torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+    embedding_list = []
+    for i in range(len(items_size) - 1):
+        if items_size[i] == items_size[i + 1]:
+            continue
+        embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
+        items_offset = items_offset_list[i]
+        embedding_items_hash = get_embedding_hash(embedding_items_per_req)
+        # if all items has been prefixed, we do not need to calculate embedding
+        if all([offset_end < prefix_length[i] for _, offset_end in items_offset]):
+            continue
+        embedding_per_req = embedding_cache.get(embedding_items_hash)
+        if embedding_per_req is None:
+            embedding_per_req = data_embedding_func(embedding_items_per_req)
+            if not embedding_cache.put(embedding_items_hash, embedding_per_req):
+                print_warning_once(
+                    "Multimodal embedding cache is full. Consider increasing the "
+                    "`SGLANG_VLM_CACHE_SIZE_MB` environment variable."
+                )
+
+        embedding_per_req_chunk, _, end_index = get_embedding_chunk(
+            embedding=embedding_per_req,
+            extend_prefix_len=prefix_length[i],
+            extend_seq_len=extend_length[i],
+            items_offset=items_offset,
+        )
+        # remove this item from cache if chunk reaches to the end
+        embedding_per_req_length = (
+            embedding_per_req.shape[0]
+            if embedding_per_req.dim() == 2
+            else embedding_per_req.shape[0] * embedding_per_req.shape[1]
+        )
+        if end_index == embedding_per_req_length:
+            embedding_cache.free(embedding_items_hash)
+        embedding_list.append(embedding_per_req_chunk)
+    if len(embedding_list) == 0:
+        return None
+    return torch.concat(embedding_list, dim=0)
+
+
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
 
-
+
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
-            f"Number of tokens in multimodal embedding does not match those in the input text."
+            f"Number of tokens in multimodal embedding does not match those in the input text. "
             f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
-            "tokens from multimodal embeddings."
+            f"tokens from multimodal embeddings."
         )
         if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
-            # TODO: chunked prefill will split special tokens from input_ids into several passes, failing the embedding
-            # a fix may be cache the unfinished multimodal embedding for future reuse, determine the tokens to embed with
-            # extend_start_loc and extend_seq_lens
             chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
             if chunked_prefill_size != -1:
                 logger.warning(
@@ -225,12 +356,61 @@ def get_embedding_and_mask(
             raise RuntimeError(
                 f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
             )
+    return embedding
+
 
+def get_embedding_and_mask(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
+
+    Args:
+        data_embedding_func: Function that generates embeddings for multimodal items
+        embedding_items: List of multimodal items to embed
+        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
+        input_ids: The input token IDs tensor
+        items_size: Cumulative sizes of multimodal items per request
+        prefix_length: Prefix lengths for each request
+        extend_length: Sequence lengths for each request
+        items_offset_list: List of offset ranges for multimodal items in each request
+
+    Returns:
+        A tuple containing:
+        - The generated embeddings tensor
+        - A boolean mask tensor indicating where these embeddings should be placed
+    """
+    # 1. Get embedding
+    embedding = _get_precomputed_embedding(embedding_items)
+    if embedding is None:
+        embedding = _get_chunked_prefill_embedding(
+            data_embedding_func,
+            embedding_items,
+            items_size,
+            prefix_length,
+            extend_length,
+            items_offset_list,
+        )
+        if embedding is None:
+            return None, None
+    # 2. Get mask
+    special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
+    # 3. Adjust embedding length if needed
+    embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
     return embedding, special_multimodal_mask
 
 
 def embed_mm_inputs(
-
+    mm_inputs_list: List[MultimodalInputs],
+    extend_prefix_lens: List[int],
+    extend_seq_lens: List[int],
     input_ids: torch.Tensor,
     input_embedding: nn.Embedding,
     image_data_embedding_func: Callable[
@@ -242,125 +422,133 @@ def embed_mm_inputs(
     placeholder_tokens: dict[Modality, List[int]] = None,
 ) -> Optional[torch.Tensor]:
     """
-
-
-
-
-
+    Embed multimodal inputs and integrate them with text token embeddings.
+
+    Args:
+        mm_inputs_list: List of multimodal inputs to process
+        extend_prefix_lens: Prefix lengths for each request
+        extend_seq_lens: Sequence lengths for each request
+        input_ids: Input token IDs tensor
+        input_embedding: Embedding layer for text tokens
+        image_data_embedding_func: Function to embed image data
+        audio_data_embedding_func: Function to embed audio data
+        placeholder_tokens: Token IDs for multimodal placeholders (uses pad_values if None)
 
-
-
+    Returns:
+        Combined embedding tensor with multimodal content integrated
     """
 
-    if
+    if mm_inputs_list is None:
         return None
 
     # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
     # we assume that multimodal data are represented with its pad_values in input_ids
-
-
-
-    if placeholder_tokens is not None:
-        placeholder_token_ids = flatten_nested_list(
-            [placeholder_token for placeholder_token in placeholder_tokens.values()]
-        )
-    else:
-        placeholder_token_ids = [item.pad_value for item in mm_inputs.mm_items]
-
-    assert isinstance(placeholder_token_ids[0], int)
-
-    placeholder_tensor = torch.tensor(placeholder_token_ids, device=input_ids.device)
-
-    placeholder_masks = torch.isin(input_ids, placeholder_tensor)
-
-    appearing_pad_values = torch.unique(
-        input_ids[placeholder_masks], return_counts=False
-    )
+    item_flatten_list = []
+    for mm_inputs in mm_inputs_list:
+        item_flatten_list += [item for item in mm_inputs.mm_items if item is not None]
 
-
-        # all been prefixed
-        inputs_embeds = input_embedding(input_ids)
-    else:
-        appearing_items = [
-            item
-            for item in mm_inputs.mm_items
-            if item.pad_value is not None and item.pad_value in appearing_pad_values
-        ]
+    embeddings, masks = [], []
 
-
-
-
-
-
+    # 2. Get multimodal embedding separately
+    # TODO: make this more generic
+    # Try get image embedding if any
+    if (
+        any(True for item in item_flatten_list if item.is_image())
+        and image_data_embedding_func
+    ):
+        items = [item for item in item_flatten_list if item.is_image()]
+        placeholder_tensor = torch.tensor(
+            [item.pad_value for item in items],
+            device=input_ids.device,
+        )
+        # calculate per request items length offset
+        items_size = torch.zeros(len(mm_inputs_list) + 1, dtype=int)
+        items_offsets = []
+        for i, mm_inputs in enumerate(mm_inputs_list):
+            image_items = [item for item in mm_inputs.mm_items if item.is_image()]
+            items_size[i + 1] = len(image_items)
+            items_offsets.append(
+                flatten_nested_list(
+                    [
+                        item.image_offsets
+                        for item in mm_inputs.mm_items
+                        if item.is_image()
+                    ]
+                )
             )
-
-        appearing_items = mm_inputs.mm_items
+        items_size = torch.cumsum(items_size, dim=0).tolist()
 
-
+        embedding, mask = get_embedding_and_mask(
+            data_embedding_func=image_data_embedding_func,
+            embedding_items=items,
+            placeholder_tensor=placeholder_tensor,
+            input_ids=input_ids,
+            items_size=items_size,
+            prefix_length=extend_prefix_lens,
+            extend_length=extend_seq_lens,
+            items_offset_list=items_offsets,
+        )
+        embeddings += [embedding]
+        masks += [mask]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Try get audio embedding if any
+    if (
+        any(True for item in item_flatten_list if item.is_audio())
+        and audio_data_embedding_func
+    ):
+        items = [item for item in item_flatten_list if item.is_audio()]
+        placeholder_tensor = torch.tensor(
+            [item.pad_value for item in items],
+            device=input_ids.device,
+        )
+        items_offsets = []
+        # calculate per request items length offset
+        items_size = torch.zeros(len(mm_inputs_list) + 1, dtype=int)
+        for i, mm_inputs in enumerate(mm_inputs_list):
+            audio_items = [item for item in mm_inputs.mm_items if item.is_audio()]
+            items_size[i + 1] = len(audio_items)
+            items_offsets.append(
+                flatten_nested_list(
+                    [
+                        item.audio_offsets
+                        for item in mm_inputs.mm_items
+                        if item.is_audio()
+                    ]
+                )
            )
-
-        masks += [mask]
+        items_size = torch.cumsum(items_size, dim=0)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # 4. Scatter embeddings into input embedding
-    for embedding, mask in zip(embeddings, masks):
-        mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
-        inputs_embeds = inputs_embeds.masked_scatter(
-            mask,
-            embedding.to(inputs_embeds.device, inputs_embeds.dtype),
-        )
+        embedding, mask = get_embedding_and_mask(
+            data_embedding_func=audio_data_embedding_func,
+            embedding_items=items,
+            placeholder_tensor=placeholder_tensor,
+            input_ids=input_ids,
+            items_size=items_size,
+            prefix_length=extend_prefix_lens,
+            extend_length=extend_seq_lens,
+            items_offset_list=items_offsets,
+        )
+        embeddings += [embedding]
+        masks += [mask]
+
+    # 3. Get input embeddings
+    vocab_size = input_embedding.num_embeddings
+    # Important: clamp after getting original multimodal regions
+    # Clamp input ids. This is because the input_ids for the multimodal tokens are
+    # filled with the hash values of the multimodal for the prefix matching in the radix attention.
+    # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+    input_ids.clamp_(min=0, max=vocab_size - 1)
+    inputs_embeds = input_embedding(input_ids)
+
+    # 4. scatter embeddings into input embedding
+    for embedding, mask in zip(embeddings, masks):
+        if embedding is None or mask is None:
+            continue
+        mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        inputs_embeds = inputs_embeds.masked_scatter(
+            mask,
+            embedding.to(inputs_embeds.device, inputs_embeds.dtype),
+        )
     return inputs_embeds
 
 
@@ -368,37 +556,53 @@ def general_mm_embed_routine(
     input_ids: torch.Tensor,
     forward_batch: ForwardBatch,
     language_model: nn.Module,
-    image_data_embedding_func:
-        [List[MultimodalDataItem]], torch.Tensor
+    image_data_embedding_func: Optional[
+        Callable[[List[MultimodalDataItem]], torch.Tensor]
    ] = None,
-    audio_data_embedding_func:
-        [List[MultimodalDataItem]], torch.Tensor
+    audio_data_embedding_func: Optional[
+        Callable[[List[MultimodalDataItem]], torch.Tensor]
    ] = None,
-    placeholder_tokens: dict[Modality, List[int]] = None,
+    placeholder_tokens: Optional[dict[Modality, List[int]]] = None,
    **kwargs,
 ) -> torch.Tensor:
     """
-
-
-    Args:
-        placeholder_token_ids (List[int]): the ids of mm data placeholder tokens
-        image_data_embedding_func : the function returning the image embedding
-        audio_data_embedding_func : the function returning the image embedding
+    Process multimodal inputs and forward through language model.
 
-
-
+    Args:
+        input_ids: Input token IDs tensor
+        forward_batch: Batch information for model forward pass
+        language_model: Base language model to use
+        image_data_embedding_func: Function to embed image data
+        audio_data_embedding_func: Function to embed audio data
+        placeholder_tokens: Token IDs for multimodal placeholders
+        **kwargs: Additional arguments passed to language model
 
+    Returns:
+        Hidden states from language model forward pass
     """
-
     assert hasattr(language_model, "get_input_embeddings")
     embed_tokens = language_model.get_input_embeddings()
     if (
         not forward_batch.forward_mode.is_decode()
         and forward_batch.contains_mm_inputs()
     ):
-
+        mm_inputs_list = [
+            mm_input for mm_input in forward_batch.mm_inputs if mm_input is not None
+        ]
+        extend_prefix_lens = [
+            prefix_len
+            for i, prefix_len in enumerate(forward_batch.extend_prefix_lens_cpu)
+            if forward_batch.mm_inputs[i] is not None
+        ]
+        extend_seq_lens = [
+            seq_len
+            for i, seq_len in enumerate(forward_batch.extend_seq_lens_cpu)
+            if forward_batch.mm_inputs[i] is not None
+        ]
         inputs_embeds = embed_mm_inputs(
-
+            mm_inputs_list=mm_inputs_list,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_seq_lens=extend_seq_lens,
             input_ids=input_ids,
             input_embedding=embed_tokens,
             image_data_embedding_func=image_data_embedding_func,