sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -3,32 +3,41 @@ import concurrent.futures
|
|
3
3
|
import dataclasses
|
4
4
|
import multiprocessing as mp
|
5
5
|
import os
|
6
|
+
import re
|
6
7
|
from abc import ABC, abstractmethod
|
7
|
-
from
|
8
|
+
from enum import Enum
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
8
10
|
|
9
11
|
import numpy as np
|
10
|
-
import PIL
|
11
12
|
import torch
|
12
13
|
from PIL import Image
|
13
14
|
from transformers import BaseImageProcessorFast
|
14
15
|
|
15
|
-
from sglang.srt.managers.schedule_batch import Modality
|
16
|
+
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
|
16
17
|
from sglang.srt.utils import encode_video, load_audio, load_image
|
17
18
|
|
18
19
|
|
20
|
+
class MultimodalInputFormat(Enum):
|
21
|
+
"""Enum for different multimodal input formats."""
|
22
|
+
|
23
|
+
RAW_IMAGES = "raw_images"
|
24
|
+
PRECOMPUTED_FEATURES = "precomputed_features"
|
25
|
+
PIXEL_VALUES = "pixel_values"
|
26
|
+
|
27
|
+
|
19
28
|
@dataclasses.dataclass
|
20
29
|
class BaseMultiModalProcessorOutput:
|
21
30
|
# input_text, with each frame of video/image represented with a image_token
|
22
31
|
input_text: str
|
23
32
|
|
24
33
|
# frames loaded from image and video, in given order
|
25
|
-
images: Optional[list[
|
34
|
+
images: Optional[list[Union[Image.Image, dict]]] = None
|
26
35
|
|
27
36
|
# audios
|
28
|
-
audios: Optional[list[np.ndarray]] = None
|
37
|
+
audios: Optional[list[Union[np.ndarray, dict]]] = None
|
29
38
|
|
30
39
|
def normalize(self):
|
31
|
-
for field_name in ["
|
40
|
+
for field_name in ["images", "audios"]:
|
32
41
|
field = getattr(self, field_name, None)
|
33
42
|
if field is not None and isinstance(field, list) and len(field) == 0:
|
34
43
|
setattr(self, field_name, None)
|
@@ -36,16 +45,48 @@ class BaseMultiModalProcessorOutput:
|
|
36
45
|
|
37
46
|
@dataclasses.dataclass
|
38
47
|
class MultimodalSpecialTokens:
|
39
|
-
image_token: Optional[str] = None
|
40
|
-
video_token: Optional[str] = None
|
41
|
-
audio_token: Optional[str] = None
|
42
|
-
|
43
|
-
def
|
44
|
-
|
45
|
-
token
|
46
|
-
|
47
|
-
|
48
|
+
image_token: Optional[Union[int, str, List[str]]] = None
|
49
|
+
video_token: Optional[Union[int, str, List[str]]] = None
|
50
|
+
audio_token: Optional[Union[int, str, List[str]]] = None
|
51
|
+
|
52
|
+
def convert_to_str(self, token: Union[str, int], processor) -> str:
|
53
|
+
if token is None:
|
54
|
+
return token
|
55
|
+
if isinstance(token, str):
|
56
|
+
return token
|
57
|
+
return processor.tokenizer.convert_ids_to_tokens([token])[0]
|
58
|
+
|
59
|
+
def convert_to_strs(self, processor):
|
60
|
+
self.image_token = self.convert_to_str(self.image_token, processor)
|
61
|
+
self.video_token = self.convert_to_str(self.video_token, processor)
|
62
|
+
self.audio_token = self.convert_to_str(self.audio_token, processor)
|
63
|
+
|
64
|
+
image_token_regex: Optional[re.Pattern] = None
|
65
|
+
video_token_regex: Optional[re.Pattern] = None
|
66
|
+
audio_token_regex: Optional[re.Pattern] = None
|
67
|
+
|
68
|
+
def __post_init__(self):
|
69
|
+
if self.image_token_regex is None and self.image_token is not None:
|
70
|
+
self.image_token_regex = re.compile(re.escape(self.image_token))
|
71
|
+
if self.video_token_regex is None and self.video_token is not None:
|
72
|
+
self.video_token_regex = re.compile(re.escape(self.video_token))
|
73
|
+
if self.audio_token_regex is None and self.audio_token is not None:
|
74
|
+
self.audio_token_regex = re.compile(re.escape(self.audio_token))
|
75
|
+
|
76
|
+
def collect(self) -> re.Pattern:
|
77
|
+
tokens = [
|
78
|
+
self.image_token_regex,
|
79
|
+
self.video_token_regex,
|
80
|
+
self.audio_token_regex,
|
48
81
|
]
|
82
|
+
patterns = []
|
83
|
+
flags = 0
|
84
|
+
for t in tokens:
|
85
|
+
if t is not None:
|
86
|
+
patterns.append(t.pattern)
|
87
|
+
flags |= t.flags
|
88
|
+
combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
|
89
|
+
return re.compile(combined, flags)
|
49
90
|
|
50
91
|
|
51
92
|
class BaseMultimodalProcessor(ABC):
|
@@ -54,6 +95,7 @@ class BaseMultimodalProcessor(ABC):
|
|
54
95
|
def __init__(self, hf_config, server_args, _processor):
|
55
96
|
self.hf_config = hf_config
|
56
97
|
self._processor = _processor
|
98
|
+
self.arch = hf_config.architectures[0]
|
57
99
|
self.server_args = server_args
|
58
100
|
# FIXME: not accurate, model and image specific
|
59
101
|
self.NUM_TOKEN_PER_FRAME = 330
|
@@ -136,6 +178,8 @@ class BaseMultimodalProcessor(ABC):
|
|
136
178
|
data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
|
137
179
|
):
|
138
180
|
"""Static method that can be pickled for multiprocessing"""
|
181
|
+
if isinstance(data, dict):
|
182
|
+
return data
|
139
183
|
try:
|
140
184
|
if is_audio:
|
141
185
|
return load_audio(data)
|
@@ -175,7 +219,10 @@ class BaseMultimodalProcessor(ABC):
|
|
175
219
|
image_index, audio_index = 0, 0
|
176
220
|
|
177
221
|
for text_part in text_parts:
|
178
|
-
if
|
222
|
+
if (
|
223
|
+
multimodal_tokens.image_token_regex
|
224
|
+
and multimodal_tokens.image_token_regex.match(text_part)
|
225
|
+
):
|
179
226
|
data = image_data[image_index]
|
180
227
|
is_video = isinstance(data, str) and data.startswith("video:")
|
181
228
|
estimated_frames = estimated_frames_list[image_index]
|
@@ -192,7 +239,10 @@ class BaseMultimodalProcessor(ABC):
|
|
192
239
|
)
|
193
240
|
task_info.append((Modality.IMAGE, data, frame_count_limit))
|
194
241
|
image_index += 1
|
195
|
-
elif
|
242
|
+
elif (
|
243
|
+
multimodal_tokens.audio_token_regex
|
244
|
+
and multimodal_tokens.audio_token_regex.match(text_part)
|
245
|
+
):
|
196
246
|
data = audio_data[audio_index]
|
197
247
|
futures.append(
|
198
248
|
self.io_executor.submit(
|
@@ -228,17 +278,13 @@ class BaseMultimodalProcessor(ABC):
|
|
228
278
|
discard_alpha_channel: if True, discards the alpha channel in the returned images
|
229
279
|
|
230
280
|
"""
|
231
|
-
|
281
|
+
if not return_text:
|
282
|
+
raise NotImplementedError()
|
232
283
|
if image_data is None:
|
233
284
|
image_data = []
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
multimodal_tokens.image_token
|
238
|
-
)
|
239
|
-
)
|
240
|
-
else:
|
241
|
-
multimodal_tokens.image_token = multimodal_tokens.image_token
|
285
|
+
|
286
|
+
multimodal_tokens.convert_to_strs(self._processor)
|
287
|
+
multimodal_tokens_pattern = multimodal_tokens.collect()
|
242
288
|
|
243
289
|
if isinstance(prompt, list) and return_text:
|
244
290
|
assert len(prompt) and isinstance(prompt[0], int)
|
@@ -247,16 +293,8 @@ class BaseMultimodalProcessor(ABC):
|
|
247
293
|
prompt = prompt
|
248
294
|
|
249
295
|
assert isinstance(prompt, str)
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
pattern = (
|
254
|
-
"("
|
255
|
-
+ "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
|
256
|
-
+ ")"
|
257
|
-
)
|
258
|
-
# split text into list of normal text and special tokens
|
259
|
-
text_parts = re.split(pattern, prompt)
|
296
|
+
# split text into list of normal text and special tokens
|
297
|
+
text_parts = re.split(multimodal_tokens_pattern, prompt)
|
260
298
|
|
261
299
|
futures, task_info = self.submit_data_loading_tasks(
|
262
300
|
text_parts=text_parts,
|
@@ -266,34 +304,253 @@ class BaseMultimodalProcessor(ABC):
|
|
266
304
|
discard_alpha_channel=discard_alpha_channel,
|
267
305
|
)
|
268
306
|
# Process results
|
269
|
-
|
307
|
+
images, audios = [], []
|
270
308
|
new_text = ""
|
271
309
|
task_ptr = 0
|
272
310
|
|
273
311
|
for text_part in text_parts:
|
274
|
-
if
|
312
|
+
if multimodal_tokens_pattern.match(text_part):
|
275
313
|
task_type, data, frame_limit = task_info[task_ptr]
|
276
314
|
result = futures[task_ptr].result()
|
277
315
|
task_ptr += 1
|
278
316
|
|
279
317
|
if task_type == Modality.IMAGE:
|
318
|
+
# If data is already processed it will be a
|
319
|
+
# dictionary. In this case we want to keep the
|
320
|
+
# expanded tokens in text_part. Otherwise, we will
|
321
|
+
# call the processor code, so keep only a single image
|
322
|
+
# token.
|
323
|
+
mm_tokens = (
|
324
|
+
text_part
|
325
|
+
if isinstance(data, dict)
|
326
|
+
else multimodal_tokens.image_token
|
327
|
+
)
|
280
328
|
frames = [result] if not isinstance(result, list) else result
|
281
329
|
if frames:
|
282
|
-
image_sizes += frames[0].size * len(frames)
|
283
330
|
images += frames
|
284
|
-
new_text +=
|
331
|
+
new_text += mm_tokens * len(frames)
|
285
332
|
elif task_type == Modality.AUDIO:
|
286
333
|
# audio
|
334
|
+
mm_tokens = (
|
335
|
+
text_part
|
336
|
+
if isinstance(data, dict)
|
337
|
+
else multimodal_tokens.audio_token
|
338
|
+
)
|
287
339
|
audios.append(result)
|
288
|
-
new_text +=
|
340
|
+
new_text += mm_tokens
|
289
341
|
# TODO: handle video
|
290
342
|
else:
|
291
343
|
new_text += text_part
|
292
344
|
|
293
345
|
out = BaseMultiModalProcessorOutput(
|
346
|
+
input_text=new_text,
|
294
347
|
images=images,
|
295
348
|
audios=audios,
|
296
|
-
input_text=new_text,
|
297
349
|
)
|
298
350
|
out.normalize()
|
299
351
|
return out
|
352
|
+
|
353
|
+
@staticmethod
|
354
|
+
def get_mm_items_offset(
|
355
|
+
input_ids: torch.Tensor, mm_token_id: int
|
356
|
+
) -> List[Tuple[int, int]]:
|
357
|
+
"""
|
358
|
+
Get a set of range for mm_items from input_ids
|
359
|
+
Example:
|
360
|
+
input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
|
361
|
+
mm_token_id = 3
|
362
|
+
return result = [(2,4),(6,7)]
|
363
|
+
"""
|
364
|
+
mask = input_ids == mm_token_id
|
365
|
+
|
366
|
+
start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
|
367
|
+
end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
|
368
|
+
|
369
|
+
return list(zip(start_positions.tolist(), end_positions.tolist()))
|
370
|
+
|
371
|
+
@staticmethod
|
372
|
+
def get_mm_items_offset_by_pair(
|
373
|
+
input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
|
374
|
+
) -> List[Tuple[int, int]]:
|
375
|
+
indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
|
376
|
+
indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
|
377
|
+
|
378
|
+
return list(zip(indices_start.tolist(), indices_end.tolist()))
|
379
|
+
|
380
|
+
@staticmethod
|
381
|
+
def _extract_processor_features(
|
382
|
+
items: List[dict], attr_name: str
|
383
|
+
) -> Optional[torch.Tensor]:
|
384
|
+
"""
|
385
|
+
Helper function to concat extracted attributes from processor output.
|
386
|
+
"""
|
387
|
+
values = [value for item in items if (value := item.get(attr_name)) is not None]
|
388
|
+
return torch.cat(values) if values else None
|
389
|
+
|
390
|
+
# When we assume that all the items have the same attributes
|
391
|
+
def _extract_processor_features_from_all_attributes(
|
392
|
+
self, items: List[dict]
|
393
|
+
) -> dict:
|
394
|
+
values = {}
|
395
|
+
# Verify all items have the same keys
|
396
|
+
first_keys = set(items[0].keys())
|
397
|
+
for item in items[1:]:
|
398
|
+
if set(item.keys()) != first_keys:
|
399
|
+
raise ValueError(
|
400
|
+
f"All items must have the same attributes. "
|
401
|
+
f"First item has {first_keys}, but found {set(item.keys())}"
|
402
|
+
)
|
403
|
+
|
404
|
+
# Process each attribute
|
405
|
+
for k, v in items[0].items():
|
406
|
+
if isinstance(v, list):
|
407
|
+
values[k] = self._extract_processor_features(items, k)
|
408
|
+
else:
|
409
|
+
# Verify all items have the same value for non-list attributes
|
410
|
+
for item in items[1:]:
|
411
|
+
if item[k] != v:
|
412
|
+
raise ValueError(
|
413
|
+
f"All items must have the same value for attribute {k}. "
|
414
|
+
f"First item has {v}, but found {item[k]}"
|
415
|
+
)
|
416
|
+
values[k] = v
|
417
|
+
return values
|
418
|
+
|
419
|
+
def process_and_combine_mm_data(
|
420
|
+
self, base_output: BaseMultiModalProcessorOutput
|
421
|
+
) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
|
422
|
+
"""
|
423
|
+
Process multimodal data and return the combined multimodal item and input_ids.
|
424
|
+
Handles all three input formats at the same abstraction level.
|
425
|
+
|
426
|
+
Returns:
|
427
|
+
Tuple of (combined_mm_item, input_ids)
|
428
|
+
"""
|
429
|
+
|
430
|
+
def tokenize_text(input_text: str) -> torch.Tensor:
|
431
|
+
"""Tokenize input text."""
|
432
|
+
return self._processor.tokenizer(
|
433
|
+
input_text,
|
434
|
+
return_tensors="pt",
|
435
|
+
add_special_tokens=True,
|
436
|
+
).input_ids.flatten()
|
437
|
+
|
438
|
+
def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
|
439
|
+
"""Categorize multimodal inputs and validate consistency."""
|
440
|
+
try:
|
441
|
+
has_image = False
|
442
|
+
has_pixel_values = False
|
443
|
+
has_precomputed_features = False
|
444
|
+
|
445
|
+
for mm_input in mm_inputs:
|
446
|
+
if isinstance(mm_input, Image.Image):
|
447
|
+
has_image = True
|
448
|
+
elif isinstance(mm_input, dict):
|
449
|
+
if mm_input.get("precomputed_features", None) is not None:
|
450
|
+
has_precomputed_features = True
|
451
|
+
elif mm_input.get("pixel_values", None) is not None:
|
452
|
+
has_pixel_values = True
|
453
|
+
else:
|
454
|
+
raise ValueError(
|
455
|
+
f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
|
456
|
+
)
|
457
|
+
else:
|
458
|
+
raise ValueError(
|
459
|
+
f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
|
+                )
+
+                # Validate format consistency
+                format_count = sum(
+                    [has_image, has_pixel_values, has_precomputed_features]
+                )
+                if format_count > 1:
+                    raise ValueError(
+                        "Unsupported: mixture of multimodal input formats. "
+                        f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
+                        f"precomputed_features={has_precomputed_features}"
+                    )
+
+                if has_image:
+                    return MultimodalInputFormat.RAW_IMAGES
+                elif has_precomputed_features:
+                    return MultimodalInputFormat.PRECOMPUTED_FEATURES
+                elif has_pixel_values:
+                    return MultimodalInputFormat.PIXEL_VALUES
+                else:
+                    raise ValueError("No valid multimodal input format found")
+            except Exception as e:
+                raise ValueError(f"Failed to categorize inputs: {e}")
+
+        def process_raw_images(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process raw Image.Image objects using transformers processor."""
+            ret = self.process_mm_data(
+                input_text=base_output.input_text,
+                images=base_output.images,
+            )
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+
+            # Copy all fields from processor output except input_ids
+            for key, value in ret.items():
+                if key != "input_ids" and hasattr(combined_mm_item, key):
+                    setattr(combined_mm_item, key, value)
+
+            input_ids = ret["input_ids"].flatten()
+            return combined_mm_item, input_ids
+
+        def process_precomputed_features(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with precomputed features."""
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+            combined_mm_item.precomputed_features = self._extract_processor_features(
+                base_output.images, "precomputed_features"
+            )
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def process_pixel_values(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with pixel values."""
+            values = self._extract_processor_features_from_all_attributes(
+                base_output.images
+            )
+            combined_mm_item = MultimodalDataItem.from_dict(values)
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def finalize_mm_item(
+            combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
+        ) -> MultimodalDataItem:
+            """Apply common post-processing to the multimodal item."""
+            combined_mm_item.image_offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=self.IM_TOKEN_ID,
+            )
+            return combined_mm_item
+
+        # Main logic
+        mm_inputs = base_output.images
+        if not mm_inputs:
+            # Return text-only case
+            input_ids = tokenize_text(base_output.input_text)
+            return None, input_ids
+
+        # Categorize input formats
+        input_format = categorize_mm_inputs(mm_inputs)
+
+        # Process based on format
+        if input_format == MultimodalInputFormat.RAW_IMAGES:
+            combined_mm_item, input_ids = process_raw_images(base_output)
+        elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
+            combined_mm_item, input_ids = process_precomputed_features(base_output)
+        elif input_format == MultimodalInputFormat.PIXEL_VALUES:
+            combined_mm_item, input_ids = process_pixel_values(base_output)
+        else:
+            raise ValueError(f"Unknown input format: {input_format}")
+
+        # Finalize with common processing
+        combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
+        return combined_mm_item, input_ids
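The hunk above introduces a single dispatch point that classifies every request into one of three mutually exclusive input formats before any processing happens. A standalone sketch of that three-way classification follows; the `hasattr`/`isinstance` detection heuristics here are illustrative assumptions, not sglang's actual checks, and only the enum names mirror the diff:

```python
# Standalone sketch of the three-way format dispatch added above.
from enum import Enum, auto
from typing import Any, List


class MultimodalInputFormat(Enum):
    RAW_IMAGES = auto()
    PRECOMPUTED_FEATURES = auto()
    PIXEL_VALUES = auto()


def categorize(mm_inputs: List[Any]) -> MultimodalInputFormat:
    # Illustrative heuristics: PIL images expose .size, the other two
    # formats arrive as dicts keyed by their payload.
    has_image = any(hasattr(x, "size") for x in mm_inputs)
    has_precomputed = any(
        isinstance(x, dict) and "precomputed_features" in x for x in mm_inputs
    )
    has_pixel_values = any(
        isinstance(x, dict) and "pixel_values" in x for x in mm_inputs
    )
    if sum([has_image, has_precomputed, has_pixel_values]) > 1:
        raise ValueError("Unsupported: mixture of multimodal input formats.")
    if has_image:
        return MultimodalInputFormat.RAW_IMAGES
    if has_precomputed:
        return MultimodalInputFormat.PRECOMPUTED_FEATURES
    if has_pixel_values:
        return MultimodalInputFormat.PIXEL_VALUES
    raise ValueError("No valid multimodal input format found")


print(categorize([{"pixel_values": [[0.0]]}]))  # MultimodalInputFormat.PIXEL_VALUES
```

Keeping the formats mutually exclusive is what lets the later `if/elif` dispatch stay total: a mixed batch fails fast in categorization instead of producing a half-combined item.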
@@ -70,8 +70,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
 
         items = []
+        input_ids = res["input_ids"]
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=self._processor.image_token_id
+        )
         item = MultimodalDataItem(
             pixel_values=res["images"],
+            image_offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,
@@ -80,6 +85,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
 
         return {
             "mm_items": items,
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "im_token_id": self._processor.image_token_id,
         }
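This hunk and several below attach `image_offsets` to each `MultimodalDataItem`, derived from where the image-token runs sit in the tokenized prompt. Below is a minimal sketch of what such an offset helper computes, assuming it returns inclusive `(start, end)` index pairs per contiguous run; the real `get_mm_items_offset` in the base processor may use a different convention:

```python
from typing import List, Tuple

import torch


def mm_items_offset(input_ids: torch.Tensor, mm_token_id: int) -> List[Tuple[int, int]]:
    """Return inclusive (start, end) pairs for each contiguous run of mm_token_id."""
    mask = (input_ids == mm_token_id).long()
    # Boundaries where the 0/1 mask flips; padding with zeros catches runs
    # that touch either end of the sequence.
    diff = torch.diff(mask, prepend=torch.tensor([0]), append=torch.tensor([0]))
    starts = (diff == 1).nonzero(as_tuple=True)[0]
    ends = (diff == -1).nonzero(as_tuple=True)[0] - 1
    return list(zip(starts.tolist(), ends.tolist()))


ids = torch.tensor([5, 9, 9, 9, 7, 9, 9, 3])
print(mm_items_offset(ids, mm_token_id=9))  # [(1, 3), (5, 6)]
```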
@@ -1,4 +1,5 @@
-
+import re
+from typing import Dict, List, Union
 
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -18,13 +19,19 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<start_of_image>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -36,30 +43,21 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )
 
-
-            input_text=base_output.input_text, images=base_output.images
-        )
-
-        items = []
-        for i, image in enumerate(base_output.images):
-            item = MultimodalDataItem(
-                pixel_values=ret["pixel_values"][i],
-                modality=Modality.IMAGE,
-            )
-            items += [item]
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
-            "
-            "
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
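The new `IMAGE_TOKEN_REGEX` above is what lets `load_mm_data` recognize an image placeholder whether or not it has already been expanded into soft tokens. A quick standalone check of the two shapes the pattern accepts, runnable without sglang:

```python
import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

# Un-expanded placeholder: just the start-of-image token.
print(IMAGE_TOKEN_REGEX.fullmatch("<start_of_image>") is not None)  # True

# Pre-expanded form: start token, N soft tokens, end token.
expanded = "<start_of_image>" + "<image_soft_token>" * 3 + "<end_of_image>"
print(IMAGE_TOKEN_REGEX.fullmatch(expanded) is not None)  # True
```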
@@ -3,7 +3,6 @@
 import numpy as np
 import torch
 from decord import VideoReader, cpu
-from numpy.distutils.cpuinfo import cpu
 from PIL import Image
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
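The deleted `from numpy.distutils.cpuinfo import cpu` was a stray import that rebound the name `cpu` after decord's import, so later `cpu(0)` device-context calls hit the wrong object. A toy reproduction of the shadowing pattern, with stand-in definitions instead of the real modules:

```python
# Toy reproduction of the shadowing bug fixed above.
def cpu(device_id):  # stands in for decord's cpu(0) context factory
    return f"decord-cpu:{device_id}"

print(cpu(0))  # works: decord-cpu:0

cpu = object()  # stands in for the later `from numpy.distutils.cpuinfo import cpu`

try:
    cpu(0)  # the rebound name is no longer callable
except TypeError as exc:
    print(f"shadowed: {exc}")
```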
@@ -176,6 +175,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         if not image_data:
             return None
 
+        # Ensure image_data is a list
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -210,7 +213,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             return None
 
         pixel_values = torch.cat(pixel_values, dim=0)
-        items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]
 
         for idx, num_patches in enumerate(num_patches_list):
             image_tokens = (
@@ -221,10 +223,21 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             input_text = input_text.replace("<image>", image_tokens, 1)
 
         tokenizer = self._processor
+        input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=self.img_context_token_id,
+        )
+        items = [
+            MultimodalDataItem(
+                pixel_values=pixel_values,
+                modality=Modality.IMAGE,
+                image_offsets=image_offsets,
+            )
+        ]
+
         return {
-            "input_ids":
-            .flatten()
-            .tolist(),
+            "input_ids": input_ids.tolist(),
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
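For context, the loop feeding this hunk expands each `<image>` placeholder into a wrapped run of context tokens before tokenization, which is why the offsets can be recovered from `input_ids` afterwards. A hedged sketch of that expansion step; the token strings and the per-patch count below are illustrative assumptions, not read from this diff:

```python
# Sketch of the placeholder-expansion loop this hunk feeds into.
IMG_START, IMG_END, IMG_CONTEXT = "<img>", "</img>", "<IMG_CONTEXT>"
TOKENS_PER_PATCH = 256  # assumption: fixed context-token count per patch

input_text = "Compare <image> with <image>."
num_patches_list = [2, 1]  # patches per image, e.g. from dynamic tiling

for num_patches in num_patches_list:
    image_tokens = IMG_START + IMG_CONTEXT * (TOKENS_PER_PATCH * num_patches) + IMG_END
    # replace(..., 1) expands exactly one placeholder per image, in order
    input_text = input_text.replace("<image>", image_tokens, 1)

print(input_text.count(IMG_CONTEXT) == TOKENS_PER_PATCH * sum(num_patches_list))  # True
```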
@@ -45,15 +45,21 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
             prompt=base_out.input_text,
             images=images,
         )
+
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=processor.image_id
+        )
         return {
             "mm_items": [
                 MultimodalDataItem(
                     pixel_values=res["pixel_values"],
                     image_emb_mask=res["images_emb_mask"],
+                    image_offsets=image_offsets,
                     modality=Modality.IMAGE,
                 )
             ],
-            "input_ids":
+            "input_ids": input_ids.tolist(),
             "im_start_id": processor.image_start_id,
             "im_end_id": processor.image_end_id,
             "im_token_id": processor.image_id,
@@ -1,9 +1,7 @@
-import
-import
-from typing import List, Union
+import re
+from typing import Any, Dict, List, Optional, Union
 
 import torch
-from PIL import Image
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -22,20 +20,12 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<|media_pad|>"
-        self.
-
-        self.im_start = "<|media_start|>"
-        self.im_start_id = _processor.tokenizer.convert_tokens_to_ids(self.im_start)
-
-        self.im_end = "<|media_end|>"
-        self.im_end_id = _processor.tokenizer.convert_tokens_to_ids(self.im_end)
-
-        self.im_content = "<|media_content|>"
-        self.im_content_id = _processor.tokenizer.convert_tokens_to_ids(self.im_content)
+        self.IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")
+        self.IM_TOKEN_ID = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
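Where Gemma3 brackets an image with start/soft/end tokens, KimiVL marks it with a bare run of identical `<|media_pad|>` tokens, and the `+` quantifier makes the whole run match as a single item. A standalone illustration:

```python
import re

IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")

prompt = "Describe " + "<|media_pad|>" * 4 + " please."
match = IMAGE_TOKEN_REGEX.search(prompt)
# One match spanning the whole run, not four separate single-token matches.
print(match.group(0).count("<|media_pad|>"))  # 4
```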
@@ -50,24 +40,16 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
         )
-
-
-
-        )
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
         return {
-            "input_ids":
-            "mm_items": [
-
-                pixel_values=ret["pixel_values"],
-                image_grid_thws=ret["image_grid_hws"],
-                modality=Modality.IMAGE,
-            )
-        ],
-        "im_token_id": self.im_token_id,
-        "im_start_id": self.im_start_id,
-        "im_end_id": self.im_end_id,
-        "im_content_id": self.im_content_id,
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "im_token_id": self.IM_TOKEN_ID,
         }
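After this refactor the Gemma3 and KimiVL processors (and the others touched above) converge on one dict shape returned from `process_mm_data_async`. A sketch of that contract as read from this diff; the `TypedDict` is documentation only, since the code itself returns a plain dict, and the field comments are my reading rather than a published API:

```python
from typing import Any, List, TypedDict


class ProcessorOutput(TypedDict, total=False):
    input_ids: List[int]  # tokenized prompt, image tokens included
    mm_items: List[Any]   # zero or one combined MultimodalDataItem
    im_token_id: int      # id of the per-image placeholder token
    im_start_id: int      # optional: image-start marker (e.g. Gemma3's boi token)
    im_end_id: int        # optional: image-end marker
```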