sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/models/utils.py
CHANGED
```diff
@@ -27,7 +27,11 @@ if _is_cuda:
 
 def enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
     """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
-    return
+    return (
+        _is_cuda
+        and hasattr(forward_batch.token_to_kv_pool, "dtype")
+        and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+    )
 
 
 def create_fused_set_kv_buffer_arg(
@@ -49,3 +53,9 @@ def create_fused_set_kv_buffer_arg(
         v_scale=layer.v_scale,
         cache_loc=forward_batch.out_cache_loc,
     )
+
+
+def permute_inv(perm: torch.Tensor) -> torch.Tensor:
+    inv_perm = torch.empty_like(perm)
+    inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
+    return inv_perm
```
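The new `permute_inv` helper builds the inverse of an index permutation by scattering `arange` through `perm`. A minimal standalone sketch of the behavior (the example tensors are ours, not from the package):

```python
import torch

def permute_inv(perm: torch.Tensor) -> torch.Tensor:
    # inv_perm[perm[i]] = i, so indexing with inv_perm undoes indexing with perm.
    inv_perm = torch.empty_like(perm)
    inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
    return inv_perm

x = torch.tensor([10.0, 20.0, 30.0, 40.0])
perm = torch.tensor([2, 0, 3, 1])
shuffled = x[perm]                                   # tensor([30., 10., 40., 20.])
assert torch.equal(shuffled[permute_inv(perm)], x)   # inverse restores the order
```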
sglang/srt/multimodal/processors/base_processor.py
CHANGED

```diff
@@ -155,7 +155,6 @@ class BaseMultimodalProcessor(ABC):
     ):
         self.hf_config = hf_config
         self._processor = _processor
-        self.arch = hf_config.architectures[0]
         self.server_args = server_args
         self.transport_mode = transport_mode
 
@@ -179,6 +178,7 @@ class BaseMultimodalProcessor(ABC):
         "image_attention_mask": Modality.IMAGE,
         "image_emb_mask": Modality.IMAGE,
         "images_spatial_crop": Modality.IMAGE,
+        "images_crop": Modality.IMAGE,
         "tgt_size": Modality.IMAGE,
         "image_grid_hws": Modality.IMAGE,
         "aspect_ratio_ids": Modality.IMAGE,
@@ -191,6 +191,7 @@ class BaseMultimodalProcessor(ABC):
         "input_features": Modality.AUDIO,
         "input_features_mask": Modality.AUDIO,
         "audio_attention_mask": Modality.AUDIO,
+        "feature_attention_mask": Modality.AUDIO,
         # Video-related attributes
         "pixel_values_videos": Modality.VIDEO,
         "second_per_grid_ts": Modality.VIDEO,
@@ -222,6 +223,7 @@ class BaseMultimodalProcessor(ABC):
         if self._processor.__class__.__name__ in {
             "Gemma3nProcessor",
             "Qwen2AudioProcessor",
+            "Qwen3OmniMoeProcessor",
         }:
             # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
             kwargs["audio"] = audios
@@ -312,7 +314,9 @@ class BaseMultimodalProcessor(ABC):
             try:
                 if modality == Modality.IMAGE:
                     img, _ = load_image(data)
-
+                    if discard_alpha_channel and img.mode != "RGB":
+                        img = img.convert("RGB")
+                    return img
                 elif modality == Modality.VIDEO:
                     return load_video(data, frame_count_limit)
                 elif modality == Modality.AUDIO:
```
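The image-loading branch in `BaseMultimodalProcessor` now discards the alpha channel at load time instead of leaving that to downstream processors. A tiny illustrative PIL snippet (ours, not from the package) showing what the added branch does:

```python
from PIL import Image

# Any non-RGB mode (RGBA, LA, P, ...) is normalized to RGB, discarding alpha.
img = Image.new("RGBA", (4, 4), (255, 0, 0, 128))
if img.mode != "RGB":
    img = img.convert("RGB")
print(img.mode)  # -> RGB
```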
sglang/srt/multimodal/processors/deepseek_ocr.py
ADDED

```diff
@@ -0,0 +1,37 @@
+from typing import List, Union
+
+from sglang.srt.models.deepseek_ocr import DeepseekOCRForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class DeepseekOCRProcessor(BaseMultimodalProcessor):
+    models = [DeepseekOCRForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        _processor.image_size = 640
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>", image_token_id=self._processor.image_token_id
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
```
sglang/srt/multimodal/processors/deepseek_vl_v2.py
CHANGED

```diff
@@ -18,9 +18,6 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 from typing import List, Union
 
-import torch
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
```
sglang/srt/multimodal/processors/glm4v.py
CHANGED

```diff
@@ -1,4 +1,3 @@
-import re
 from typing import List, Union
 
 from decord import VideoReader
@@ -9,10 +8,7 @@ from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.multimodal.processors.base_processor import (
-    BaseMultiModalProcessorOutput,
-    MultimodalSpecialTokens,
-)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 class Glm4vImageProcessor(SGLangBaseProcessor):
```
sglang/srt/multimodal/processors/internvl.py
CHANGED

```diff
@@ -4,10 +4,8 @@ from functools import lru_cache
 
 import numpy as np
 import torch
-import torchvision.transforms as T
 from decord import VideoReader, cpu, gpu
 from PIL import Image
-from torchvision.transforms import InterpolationMode
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.interns1 import InternS1ForConditionalGeneration
```
sglang/srt/multimodal/processors/janus_pro.py
CHANGED

```diff
@@ -1,6 +1,5 @@
 from typing import List, Union
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
```
sglang/srt/multimodal/processors/mllama4.py
CHANGED

```diff
@@ -1,13 +1,5 @@
 from typing import List, Union
 
-import torch
-from transformers.image_utils import SizeDict
-from transformers.models.llama4.image_processing_llama4_fast import (
-    find_supported_resolutions,
-    get_best_fit,
-)
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
```
sglang/srt/multimodal/processors/phi4mm.py
CHANGED

```diff
@@ -3,7 +3,6 @@ from typing import List, Union
 
 from transformers.processing_utils import ProcessorMixin
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
```
sglang/srt/multimodal/processors/points_v15_chat.py
ADDED

```diff
@@ -0,0 +1,52 @@
+# Copy from qwen_vl.py, adapted for points-v15-chat
+
+import asyncio
+from typing import List, Union
+
+from PIL import Image
+
+from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
+from sglang.srt.multimodal.processors.qwen_vl import (
+    Qwen2_5VLImageProcessor,
+    resize_image_async,
+)
+
+
+class POINTSV15ChatProcessor(Qwen2_5VLImageProcessor):
+    models = [POINTSV15ChatModel]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # Compatible with POINTSV15Chat
+        hf_config.vision_start_token_id = None
+        hf_config.vision_end_token_id = None
+        hf_config.video_token_id = None
+
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
+            resize_tasks = [resize_image_async(image) for image in base_output.images]
+            base_output.images = await asyncio.gather(*resize_tasks)
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
```
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED

```diff
@@ -9,9 +9,11 @@ import torchvision
 from PIL import Image
 from torchvision.transforms import InterpolationMode
 
+from sglang.srt.environ import envs
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.models.qwen3_omni_moe import Qwen3OmniMoeForConditionalGeneration
 from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
 from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -22,8 +24,14 @@ from sglang.utils import logger
 
 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS =
+MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
 MAX_RATIO = 200
+RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
+if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
+    logger.warning(
+        f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
+        f"Ignoring and using default."
+    )
 VIDEO_TOTAL_PIXELS = int(
     float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
 )
@@ -85,7 +93,7 @@ def resize_image(
         min_pixels=min_pixels,
         max_pixels=max_pixels,
     )
-    image = image.resize((resized_width, resized_height))
+    image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
     return image
 
 
@@ -206,25 +214,41 @@ async def preprocess_video(
         interpolation=InterpolationMode.BICUBIC,
         antialias=True,
     ).float()
-
-
-
-
-
+    video_metadata = {
+        "fps": video_fps,
+        "duration": total_frames / video_fps,
+        "total_num_frames": total_frames,
+        "frames_indices": idx,
+        "video_backend": "torchvision",
+    }
+    return video, video_metadata
+
+
+# Compatible with Qwen-VL & Qwen-Omni Series
+class QwenVLImageProcessor(SGLangBaseProcessor):
     models = [
         Qwen2VLForConditionalGeneration,
         Qwen2_5_VLForConditionalGeneration,
         Qwen3VLForConditionalGeneration,
         Qwen3VLMoeForConditionalGeneration,
+        Qwen3OmniMoeForConditionalGeneration,
     ]
 
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        self.model_type = hf_config.model_type
+        if hf_config.model_type == "qwen3_omni_moe":
+            hf_config = hf_config.thinker_config
+
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
-
+
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
+
+        self.audio_start_token_id = getattr(hf_config, "audio_start_token_id", None)
+        self.audio_token_id = getattr(hf_config, "audio_token_id", None)
+
         self.NUM_TOKEN_PER_FRAME = 770
         self.IMAGE_FACTOR = 28
         self.MIN_PIXELS = 4 * 28 * 28
@@ -233,10 +257,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<|vision_start|><|image_pad|><|vision_end|>",
             image_token_id=hf_config.image_token_id,
+            # The regex that matches expanded image tokens.
            image_token_regex=re.compile(
                 r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
             ),
             video_token_id=hf_config.video_token_id,
+            audio_token_id=self.audio_token_id,
         ).build(_processor)
 
     async def process_mm_data_async(
@@ -247,11 +273,11 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             video_data=request_obj.video_data,
+            audio_data=request_obj.audio_data,
             multimodal_tokens=self.mm_tokens,
         )
 
@@ -260,29 +286,61 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
+        video_metadata = None
         if base_output.videos:
-
-
-
+            video_results = await asyncio.gather(
+                *[preprocess_video(video) for video in base_output.videos]
+            )
+            base_output.videos, video_metadata = map(list, zip(*video_results))
+
+        # NOTE: for qwen3-vl, video_meta need to be passed in, since do_sample_frames is already done in preprocess_video
+        if self.hf_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output,
+                self.mm_tokens,
+                video_metadata=video_metadata,
+                do_sample_frames=False,
+            )
+        else:
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output, self.mm_tokens
+            )
+
+        audio_feature_lengths = None
 
-
-
+        if self.model_type == "qwen3_omni_moe":
+            audio_item = next((mm for mm in mm_items if mm.is_audio()), None)
+            if audio_item:
+                audio_feature_lengths = torch.sum(
+                    audio_item.feature_attention_mask, dim=1
+                )
+
+        second_per_grid_ts = getattr(ret, "second_per_grid_ts", None) or getattr(
+            ret, "video_second_per_grid", None
         )
 
         input_ids = input_ids.flatten()
+
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.mm_tokens.image_token_id,
             video_token_id=self.mm_tokens.video_token_id,
             vision_start_token_id=self.vision_start_token_id,
-            model_type=self.
+            model_type=self.model_type,
             tokens_per_second=getattr(
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
             image_grid_thw=getattr(ret, "image_grid_thw", None),
             video_grid_thw=getattr(ret, "video_grid_thw", None),
-            second_per_grid_ts=
+            second_per_grid_ts=second_per_grid_ts,
+            use_audio_in_video=False,
+            audio_seqlens=audio_feature_lengths,
+            audio_token_id=getattr(self.hf_config, "audio_token_id", None),
+            audio_start_token_id=self.audio_start_token_id,
+            position_id_per_seconds=getattr(
+                self.hf_config, "position_id_per_seconds", None
+            ),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
@@ -293,6 +351,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         "im_end_id": self.IM_END_TOKEN_ID,
         "im_token_id": self.mm_tokens.image_token_id,
         "video_token_id": self.mm_tokens.video_token_id,
+        "audio_token_id": self.mm_tokens.audio_token_id,
         "mrope_positions": mrope_positions,
         "mrope_position_delta": mrope_position_delta,
     }
```
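The resize filter is now configurable: `SGLANG_RESIZE_RESAMPLE` is resolved to a PIL filter by attribute lookup on `PIL.Image`, with a warning and `None` (PIL's default) on unknown names. A standalone sketch of that lookup (illustrative only; the real code reads the value through `sglang.srt.environ.envs`):

```python
from PIL import Image

# The env value is used as an attribute name on PIL.Image, so standard filter
# names such as "BICUBIC", "BILINEAR", "LANCZOS", or "NEAREST" resolve to a
# filter; anything else yields None, which Image.resize treats as the default.
for value in ("BICUBIC", "LANCZOS", "not-a-filter"):
    resample = getattr(Image, value, None)
    print(f"{value!r} -> {resample!r}")
```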
sglang/srt/parser/conversation.py
CHANGED

```diff
@@ -838,6 +838,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="deepseek-ocr",
+        system_message="",
+        system_template="",
+        roles=("", ""),
+        sep="",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str=["<|end▁of▁sentence|>"],
+        image_token="<image>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="deepseek-vl2",
@@ -960,6 +973,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="points-v15-chat",
+        system_message="",
+        system_template="",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
 
 MODEL_TYPE_TO_TEMPLATE = {
     "internvl_chat": "internvl-2-5",
@@ -968,9 +994,16 @@ MODEL_TYPE_TO_TEMPLATE = {
     "phi4mm": "phi-4-mm",
     "minicpmv": "minicpmv",
     "minicpmo": "minicpmo",
+    "deepseek-ocr": "deepseek-ocr",
 }
 
 
+@register_conv_template_matching_function
+def match_points_v15_chat(model_path: str):
+    if re.search(r"points", model_path, re.IGNORECASE):
+        return "points-v15-chat"
+
+
 def get_model_type(model_path: str) -> Optional[str]:
     config_path = os.path.join(model_path, "config.json")
     if not os.path.exists(config_path):
@@ -1038,3 +1071,11 @@ def match_phi_4_mm(model_path: str):
         return "phi-4-mm"
     model_type = get_model_type(model_path)
     return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_ocr(model_path: str):
+    if "deepseek-ocr" in model_path.lower():
+        return "deepseek-ocr"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
```
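Both new templates are wired to models through matching functions in the file's existing style: a literal substring check for deepseek-ocr (with the usual `MODEL_TYPE_TO_TEMPLATE` fallback) and a case-insensitive regex for POINTS. A standalone sketch of the selection order (illustrative only; the real functions are registered via `@register_conv_template_matching_function`):

```python
import re

def pick_template(model_path: str):
    if "deepseek-ocr" in model_path.lower():
        return "deepseek-ocr"
    if re.search(r"points", model_path, re.IGNORECASE):
        return "points-v15-chat"
    return None  # fall through to other registered matchers

print(pick_template("deepseek-ai/DeepSeek-OCR"))              # deepseek-ocr
print(pick_template("WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat"))  # points-v15-chat
```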
sglang/srt/sampling/custom_logit_processor.py
CHANGED

```diff
@@ -1,18 +1,22 @@
 import json
 from abc import ABC, abstractmethod
 from functools import lru_cache
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import dill
+import orjson
 import torch
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
 
 @lru_cache(maxsize=None)
 def _cache_from_str(json_str: str):
     """Deserialize a json string to a Callable object.
     This function is cached to avoid redundant deserialization.
     """
-    data =
+    data = orjson.loads(json_str)
     return dill.loads(bytes.fromhex(data["callable"]))
 
 
@@ -51,3 +55,74 @@ class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
         ), f"{custom_param_list=}"
         logits[..., disallowed_token_ids] = -float("inf")
         return logits
+
+
+class ThinkingBudgetLogitProcessor(CustomLogitProcessor):
+    """A logit processor that controls the length of thinking."""
+
+    THINKING_START_TOKEN_ID: int
+    THINKING_END_TOKEN_ID: int
+    NEW_LINE_TOKEN_ID: int
+
+    def __call__(self, logits, custom_param_list: list[dict[str, Any]]):
+        if custom_param_list is None or not custom_param_list:
+            return logits
+        for i, param_dict in enumerate(custom_param_list):
+            if param_dict is None:
+                continue
+
+            thinking_budget: int | None = param_dict.get("thinking_budget")
+
+            # Skip if thinking_budget is unset, or not an integer, or negative
+            if (
+                thinking_budget is None
+                or not isinstance(thinking_budget, int)
+                or thinking_budget < 0
+            ):
+                continue
+            req: Req = param_dict.get("__req__")
+            cur_ids: list[int] = [*req.origin_input_ids, *req.output_ids]
+
+            # Check if out of thinking stage
+            if (
+                self.THINKING_START_TOKEN_ID not in cur_ids
+                or self.THINKING_END_TOKEN_ID in cur_ids
+            ):
+                continue
+
+            # Find the index of the thinking start token
+            start_index = cur_ids.index(self.THINKING_START_TOKEN_ID)
+
+            # Count the number of tokens after the thinking start token
+            num_tokens_after_start = len(cur_ids) - start_index - 1
+
+            if num_tokens_after_start < thinking_budget:
+                continue
+
+            # Ensure new line token before thinking end token
+            if not req.output_ids or req.output_ids[-1] != self.NEW_LINE_TOKEN_ID:
+                logits[i, :] = -float("inf")
+                logits[i, self.NEW_LINE_TOKEN_ID] = 0.0
+                continue
+
+            # Assign highest probability to the thinking end token
+            logits[i, :] = -float("inf")
+            logits[i, self.THINKING_END_TOKEN_ID] = 0.0
+
+        return logits
+
+
+class Qwen3ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for Qwen3 models."""
+
+    THINKING_START_TOKEN_ID: int = 151667
+    THINKING_END_TOKEN_ID: int = 151668
+    NEW_LINE_TOKEN_ID: int = 198
+
+
+class DeepSeekR1ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for DeepSeek-R1 models."""
+
+    THINKING_START_TOKEN_ID: int = 128798
+    THINKING_END_TOKEN_ID: int = 128799
+    NEW_LINE_TOKEN_ID: int = 201
```