sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
 - sglang/bench_one_batch_server.py +41 -25
 - sglang/bench_serving.py +378 -160
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +10 -15
 - sglang/profiler.py +18 -1
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +4 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +13 -64
 - sglang/srt/configs/load_config.py +25 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +136 -25
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +0 -10
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +5 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +20 -11
 - sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +4 -2
 - sglang/srt/disaggregation/decode.py +123 -31
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +157 -19
 - sglang/srt/disaggregation/nixl/conn.py +69 -24
 - sglang/srt/disaggregation/prefill.py +96 -270
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +63 -19
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +83 -80
 - sglang/srt/entrypoints/grpc_server.py +430 -234
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +195 -102
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +225 -37
 - sglang/srt/entrypoints/openai/serving_base.py +49 -2
 - sglang/srt/entrypoints/openai/serving_chat.py +29 -74
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +15 -1
 - sglang/srt/entrypoints/openai/serving_responses.py +5 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +58 -6
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +33 -4
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +20 -14
 - sglang/srt/function_call/glm4_moe_detector.py +1 -5
 - sglang/srt/function_call/gpt_oss_detector.py +1 -1
 - sglang/srt/function_call/json_array_parser.py +0 -2
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/function_call/utils.py +2 -2
 - sglang/srt/grpc/compile_proto.py +3 -3
 - sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
 - sglang/srt/layers/activation.py +10 -1
 - sglang/srt/layers/attention/aiter_backend.py +3 -3
 - sglang/srt/layers/attention/ascend_backend.py +17 -1
 - sglang/srt/layers/attention/attention_registry.py +43 -23
 - sglang/srt/layers/attention/base_attn_backend.py +20 -1
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +24 -10
 - sglang/srt/layers/attention/flashinfer_backend.py +258 -22
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
 - sglang/srt/layers/attention/flashmla_backend.py +2 -2
 - sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
 - sglang/srt/layers/attention/mamba/mamba.py +189 -241
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +0 -1
 - sglang/srt/layers/attention/nsa_backend.py +404 -90
 - sglang/srt/layers/attention/triton_backend.py +208 -34
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
 - sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
 - sglang/srt/layers/attention/utils.py +89 -7
 - sglang/srt/layers/attention/vision.py +3 -3
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +12 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +17 -0
 - sglang/srt/layers/layernorm.py +64 -19
 - sglang/srt/layers/linear.py +9 -1
 - sglang/srt/layers/logits_processor.py +152 -17
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
 - sglang/srt/layers/moe/ep_moe/layer.py +154 -625
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +6 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +7 -6
 - sglang/srt/layers/moe/utils.py +20 -5
 - sglang/srt/layers/quantization/__init__.py +5 -58
 - sglang/srt/layers/quantization/awq.py +183 -9
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +27 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +152 -81
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +42 -14
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +125 -100
 - sglang/srt/layers/quantization/mxfp4.py +35 -68
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +23 -48
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +87 -20
 - sglang/srt/layers/quantization/w8a8_int8.py +30 -24
 - sglang/srt/layers/radix_attention.py +62 -9
 - sglang/srt/layers/rotary_embedding.py +686 -17
 - sglang/srt/layers/sampler.py +47 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +0 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/triton_backend.py +0 -1
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora_manager.py +24 -9
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +40 -16
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
 - sglang/srt/managers/cache_controller.py +48 -17
 - sglang/srt/managers/data_parallel_controller.py +146 -42
 - sglang/srt/managers/detokenizer_manager.py +40 -13
 - sglang/srt/managers/io_struct.py +69 -16
 - sglang/srt/managers/mm_utils.py +20 -18
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
 - sglang/srt/managers/overlap_utils.py +96 -19
 - sglang/srt/managers/schedule_batch.py +241 -511
 - sglang/srt/managers/schedule_policy.py +15 -2
 - sglang/srt/managers/scheduler.py +420 -514
 - sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
 - sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
 - sglang/srt/managers/tokenizer_manager.py +375 -95
 - sglang/srt/managers/tp_worker.py +212 -161
 - sglang/srt/managers/utils.py +78 -2
 - sglang/srt/mem_cache/allocator.py +7 -2
 - sglang/srt/mem_cache/allocator_ascend.py +2 -2
 - sglang/srt/mem_cache/base_prefix_cache.py +2 -2
 - sglang/srt/mem_cache/chunk_cache.py +13 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +16 -1
 - sglang/srt/mem_cache/hicache_storage.py +11 -2
 - sglang/srt/mem_cache/hiradix_cache.py +16 -3
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +517 -219
 - sglang/srt/mem_cache/memory_pool_host.py +0 -1
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +53 -19
 - sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
 - sglang/srt/mem_cache/storage/backend_factory.py +2 -2
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +92 -26
 - sglang/srt/metrics/collector.py +31 -0
 - sglang/srt/metrics/func_timer.py +1 -1
 - sglang/srt/model_executor/cuda_graph_runner.py +43 -5
 - sglang/srt/model_executor/forward_batch_info.py +71 -25
 - sglang/srt/model_executor/model_runner.py +362 -270
 - sglang/srt/model_executor/npu_graph_runner.py +2 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +424 -27
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +47 -28
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +13 -52
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +19 -3
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +418 -140
 - sglang/srt/models/dots_ocr.py +0 -2
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +13 -19
 - sglang/srt/models/gemma3_mm.py +16 -0
 - sglang/srt/models/gemma3n_mm.py +1 -2
 - sglang/srt/models/glm4_moe.py +327 -382
 - sglang/srt/models/glm4_moe_nextn.py +6 -16
 - sglang/srt/models/glm4v.py +2 -1
 - sglang/srt/models/glm4v_moe.py +32 -199
 - sglang/srt/models/gpt_oss.py +5 -5
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +3 -1
 - sglang/srt/models/llama.py +2 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +5 -22
 - sglang/srt/models/longcat_flash_nextn.py +3 -14
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +13 -3
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen2_5_vl.py +3 -3
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +15 -12
 - sglang/srt/models/qwen2_vl.py +5 -2
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +19 -37
 - sglang/srt/models/qwen3_next.py +7 -12
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +37 -33
 - sglang/srt/models/qwen3_vl_moe.py +57 -185
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +0 -1
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/utils.py +11 -1
 - sglang/srt/multimodal/processors/base_processor.py +7 -2
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +0 -1
 - sglang/srt/multimodal/processors/glm4v.py +2 -6
 - sglang/srt/multimodal/processors/internvl.py +0 -2
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +75 -16
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/reasoning_parser.py +28 -2
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +17 -22
 - sglang/srt/sampling/sampling_params.py +70 -2
 - sglang/srt/server_args.py +846 -163
 - sglang/srt/server_args_config_parser.py +1 -1
 - sglang/srt/single_batch_overlap.py +36 -31
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
 - sglang/srt/speculative/eagle_info.py +57 -18
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +138 -0
 - sglang/srt/speculative/eagle_worker.py +83 -280
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
 - sglang/srt/speculative/ngram_worker.py +12 -11
 - sglang/srt/speculative/spec_info.py +2 -0
 - sglang/srt/speculative/spec_utils.py +38 -3
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/two_batch_overlap.py +28 -14
 - sglang/srt/utils/__init__.py +1 -1
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/utils/common.py +272 -82
 - sglang/srt/utils/hf_transformers_utils.py +44 -17
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +41 -0
 - sglang/test/runners.py +2 -0
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +3 -0
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/test_block_fp8.py +1 -2
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +463 -107
 - sglang/test/test_deterministic_utils.py +74 -0
 - sglang/test/test_disaggregation_utils.py +81 -0
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_utils.py +85 -20
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
 - sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
 - sglang/srt/models/vila.py +0 -306
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
| 
         @@ -27,7 +27,7 @@ from transformers import LlamaConfig 
     | 
|
| 
       27 
27 
     | 
    
         | 
| 
       28 
28 
     | 
    
         
             
            from sglang.srt.distributed import get_pp_group
         
     | 
| 
       29 
29 
     | 
    
         
             
            from sglang.srt.layers.layernorm import RMSNorm
         
     | 
| 
       30 
     | 
    
         
            -
            from sglang.srt.layers.linear import QKVParallelLinear 
     | 
| 
      
 30 
     | 
    
         
            +
            from sglang.srt.layers.linear import QKVParallelLinear
         
     | 
| 
       31 
31 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       32 
32 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       33 
33 
     | 
    
         
             
            from sglang.srt.layers.vocab_parallel_embedding import (
         
     | 
| 
         @@ -32,14 +32,10 @@ 
     | 
|
| 
       32 
32 
     | 
    
         | 
| 
       33 
33 
     | 
    
         
             
            import concurrent.futures
         
     | 
| 
       34 
34 
     | 
    
         
             
            import logging
         
     | 
| 
       35 
     | 
    
         
            -
            import  
     | 
| 
       36 
     | 
    
         
            -
            from enum import IntEnum, auto
         
     | 
| 
       37 
     | 
    
         
            -
            from typing import Any, Dict, Iterable, Optional, Tuple, Union
         
     | 
| 
      
 35 
     | 
    
         
            +
            from typing import Iterable, Optional, Tuple
         
     | 
| 
       38 
36 
     | 
    
         | 
| 
       39 
37 
     | 
    
         
             
            import torch
         
     | 
| 
       40 
     | 
    
         
            -
            import torch.nn.functional as F
         
     | 
| 
       41 
38 
     | 
    
         
             
            from torch import nn
         
     | 
| 
       42 
     | 
    
         
            -
            from tqdm import tqdm
         
     | 
| 
       43 
39 
     | 
    
         | 
| 
       44 
40 
     | 
    
         
             
            from sglang.srt.configs import LongcatFlashConfig
         
     | 
| 
       45 
41 
     | 
    
         
             
            from sglang.srt.distributed import (
         
     | 
| 
         @@ -48,9 +44,8 @@ from sglang.srt.distributed import ( 
     | 
|
| 
       48 
44 
     | 
    
         
             
            )
         
     | 
| 
       49 
45 
     | 
    
         
             
            from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
         
     | 
| 
       50 
46 
     | 
    
         
             
            from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
         
     | 
| 
       51 
     | 
    
         
            -
            from sglang.srt. 
     | 
| 
      
 47 
     | 
    
         
            +
            from sglang.srt.layers import deep_gemm_wrapper
         
     | 
| 
       52 
48 
     | 
    
         
             
            from sglang.srt.layers.activation import SiluAndMul
         
     | 
| 
       53 
     | 
    
         
            -
            from sglang.srt.layers.amx_utils import PackWeightMethod
         
     | 
| 
       54 
49 
     | 
    
         
             
            from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
         
     | 
| 
       55 
50 
     | 
    
         
             
            from sglang.srt.layers.dp_attention import (
         
     | 
| 
       56 
51 
     | 
    
         
             
                get_attention_tp_rank,
         
     | 
| 
         @@ -68,7 +63,6 @@ from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton 
     | 
|
| 
       68 
63 
     | 
    
         
             
            from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
         
     | 
| 
       69 
64 
     | 
    
         
             
            from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
         
     | 
| 
       70 
65 
     | 
    
         
             
            from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
         
     | 
| 
       71 
     | 
    
         
            -
            from sglang.srt.layers.quantization import deep_gemm_wrapper
         
     | 
| 
       72 
66 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       73 
67 
     | 
    
         
             
            from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
         
     | 
| 
       74 
68 
     | 
    
         
             
            from sglang.srt.layers.quantization.fp8_utils import (
         
     | 
| 
         @@ -85,26 +79,21 @@ from sglang.srt.layers.vocab_parallel_embedding import ( 
     | 
|
| 
       85 
79 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       86 
80 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       87 
81 
     | 
    
         
             
            )
         
     | 
| 
       88 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       89 
82 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch
         
     | 
| 
       90 
83 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import default_weight_loader
         
     | 
| 
       91 
84 
     | 
    
         
             
            from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
         
     | 
| 
      
 85 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       92 
86 
     | 
    
         
             
            from sglang.srt.utils import (
         
     | 
| 
       93 
87 
     | 
    
         
             
                BumpAllocator,
         
     | 
| 
       94 
     | 
    
         
            -
                LazyValue,
         
     | 
| 
       95 
88 
     | 
    
         
             
                add_prefix,
         
     | 
| 
       96 
89 
     | 
    
         
             
                bind_or_assign,
         
     | 
| 
       97 
90 
     | 
    
         
             
                cpu_has_amx_support,
         
     | 
| 
       98 
91 
     | 
    
         
             
                get_bool_env_var,
         
     | 
| 
       99 
92 
     | 
    
         
             
                get_device_sm,
         
     | 
| 
       100 
     | 
    
         
            -
                get_int_env_var,
         
     | 
| 
       101 
93 
     | 
    
         
             
                is_cpu,
         
     | 
| 
       102 
94 
     | 
    
         
             
                is_cuda,
         
     | 
| 
       103 
     | 
    
         
            -
                is_flashinfer_available,
         
     | 
| 
       104 
95 
     | 
    
         
             
                is_hip,
         
     | 
| 
       105 
     | 
    
         
            -
                is_non_idle_and_non_empty,
         
     | 
| 
       106 
96 
     | 
    
         
             
                is_npu,
         
     | 
| 
       107 
     | 
    
         
            -
                is_sm100_supported,
         
     | 
| 
       108 
97 
     | 
    
         
             
            )
         
     | 
| 
       109 
98 
     | 
    
         | 
| 
       110 
99 
     | 
    
         
             
            _is_hip = is_hip()
         
     | 
| 
         @@ -117,13 +106,7 @@ _is_cpu = is_cpu() 
     | 
|
| 
       117 
106 
     | 
    
         
             
            _device_sm = get_device_sm()
         
     | 
| 
       118 
107 
     | 
    
         | 
| 
       119 
108 
     | 
    
         
             
            if _is_cuda:
         
     | 
| 
       120 
     | 
    
         
            -
                from sgl_kernel import  
     | 
| 
       121 
     | 
    
         
            -
                    awq_dequantize,
         
     | 
| 
       122 
     | 
    
         
            -
                    bmm_fp8,
         
     | 
| 
       123 
     | 
    
         
            -
                    dsv3_fused_a_gemm,
         
     | 
| 
       124 
     | 
    
         
            -
                    dsv3_router_gemm,
         
     | 
| 
       125 
     | 
    
         
            -
                    merge_state_v2,
         
     | 
| 
       126 
     | 
    
         
            -
                )
         
     | 
| 
      
 109 
     | 
    
         
            +
                from sgl_kernel import awq_dequantize
         
     | 
| 
       127 
110 
     | 
    
         
             
            elif _is_cpu and _is_cpu_amx_available:
         
     | 
| 
       128 
111 
     | 
    
         
             
                pass
         
     | 
| 
       129 
112 
     | 
    
         
             
            elif _is_hip:
         
     | 
| 
         @@ -595,7 +578,7 @@ class LongcatFlashForCausalLM(nn.Module): 
     | 
|
| 
       595 
578 
     | 
    
         
             
                        config.hidden_size,
         
     | 
| 
       596 
579 
     | 
    
         
             
                        quant_config=quant_config,
         
     | 
| 
       597 
580 
     | 
    
         
             
                        prefix=add_prefix("lm_head", prefix),
         
     | 
| 
       598 
     | 
    
         
            -
                        use_attn_tp_group= 
     | 
| 
      
 581 
     | 
    
         
            +
                        use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         
     | 
| 
       599 
582 
     | 
    
         
             
                    )
         
     | 
| 
       600 
583 
     | 
    
         
             
                    self.logits_processor = LogitsProcessor(config)
         
     | 
| 
       601 
584 
     | 
    
         | 
| 
         @@ -32,17 +32,14 @@ 
     | 
|
| 
       32 
32 
     | 
    
         | 
| 
       33 
33 
     | 
    
         
             
            import concurrent.futures
         
     | 
| 
       34 
34 
     | 
    
         
             
            import logging
         
     | 
| 
       35 
     | 
    
         
            -
            import  
     | 
| 
       36 
     | 
    
         
            -
            from enum import IntEnum, auto
         
     | 
| 
       37 
     | 
    
         
            -
            from typing import Any, Dict, Iterable, Optional, Tuple, Union
         
     | 
| 
      
 35 
     | 
    
         
            +
            from typing import Iterable, Optional, Tuple
         
     | 
| 
       38 
36 
     | 
    
         | 
| 
       39 
37 
     | 
    
         
             
            import torch
         
     | 
| 
       40 
     | 
    
         
            -
            import torch.nn.functional as F
         
     | 
| 
       41 
38 
     | 
    
         
             
            from torch import nn
         
     | 
| 
       42 
     | 
    
         
            -
            from tqdm import tqdm
         
     | 
| 
       43 
39 
     | 
    
         | 
| 
       44 
40 
     | 
    
         
             
            from sglang.srt.configs import LongcatFlashConfig
         
     | 
| 
       45 
41 
     | 
    
         
             
            from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
         
     | 
| 
      
 42 
     | 
    
         
            +
            from sglang.srt.layers import deep_gemm_wrapper
         
     | 
| 
       46 
43 
     | 
    
         
             
            from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
         
     | 
| 
       47 
44 
     | 
    
         
             
            from sglang.srt.layers.dp_attention import (
         
     | 
| 
       48 
45 
     | 
    
         
             
                get_attention_tp_rank,
         
     | 
| 
         @@ -52,7 +49,6 @@ from sglang.srt.layers.dp_attention import ( 
     | 
|
| 
       52 
49 
     | 
    
         
             
            from sglang.srt.layers.layernorm import RMSNorm
         
     | 
| 
       53 
50 
     | 
    
         
             
            from sglang.srt.layers.linear import ReplicatedLinear
         
     | 
| 
       54 
51 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       55 
     | 
    
         
            -
            from sglang.srt.layers.quantization import deep_gemm_wrapper
         
     | 
| 
       56 
52 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       57 
53 
     | 
    
         
             
            from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
         
     | 
| 
       58 
54 
     | 
    
         
             
            from sglang.srt.layers.quantization.fp8_utils import (
         
     | 
| 
         @@ -75,7 +71,6 @@ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA 
     | 
|
| 
       75 
71 
     | 
    
         
             
            from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
         
     | 
| 
       76 
72 
     | 
    
         
             
            from sglang.srt.utils import (
         
     | 
| 
       77 
73 
     | 
    
         
             
                BumpAllocator,
         
     | 
| 
       78 
     | 
    
         
            -
                LazyValue,
         
     | 
| 
       79 
74 
     | 
    
         
             
                add_prefix,
         
     | 
| 
       80 
75 
     | 
    
         
             
                bind_or_assign,
         
     | 
| 
       81 
76 
     | 
    
         
             
                cpu_has_amx_support,
         
     | 
| 
         @@ -97,13 +92,7 @@ _is_cpu = is_cpu() 
     | 
|
| 
       97 
92 
     | 
    
         
             
            _device_sm = get_device_sm()
         
     | 
| 
       98 
93 
     | 
    
         | 
| 
       99 
94 
     | 
    
         
             
            if _is_cuda:
         
     | 
| 
       100 
     | 
    
         
            -
                from sgl_kernel import  
     | 
| 
       101 
     | 
    
         
            -
                    awq_dequantize,
         
     | 
| 
       102 
     | 
    
         
            -
                    bmm_fp8,
         
     | 
| 
       103 
     | 
    
         
            -
                    dsv3_fused_a_gemm,
         
     | 
| 
       104 
     | 
    
         
            -
                    dsv3_router_gemm,
         
     | 
| 
       105 
     | 
    
         
            -
                    merge_state_v2,
         
     | 
| 
       106 
     | 
    
         
            -
                )
         
     | 
| 
      
 95 
     | 
    
         
            +
                from sgl_kernel import awq_dequantize
         
     | 
| 
       107 
96 
     | 
    
         
             
            elif _is_cpu and _is_cpu_amx_available:
         
     | 
| 
       108 
97 
     | 
    
         
             
                pass
         
     | 
| 
       109 
98 
     | 
    
         
             
            elif _is_hip:
         
     | 
    
        sglang/srt/models/mimo.py
    CHANGED
    
    | 
         @@ -1,28 +1,17 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # Adapted from qwen2.py
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            from  
     | 
| 
       4 
     | 
    
         
            -
            from typing import Any, Dict, Iterable, Optional, Tuple
         
     | 
| 
      
 3 
     | 
    
         
            +
            from typing import Iterable, Optional, Tuple
         
     | 
| 
       5 
4 
     | 
    
         | 
| 
       6 
5 
     | 
    
         
             
            import torch
         
     | 
| 
       7 
6 
     | 
    
         
             
            from torch import nn
         
     | 
| 
       8 
7 
     | 
    
         | 
| 
       9 
     | 
    
         
            -
            from sglang.srt.distributed import (
         
     | 
| 
       10 
     | 
    
         
            -
                get_tensor_model_parallel_rank,
         
     | 
| 
       11 
     | 
    
         
            -
                get_tensor_model_parallel_world_size,
         
     | 
| 
       12 
     | 
    
         
            -
                split_tensor_along_last_dim,
         
     | 
| 
       13 
     | 
    
         
            -
                tensor_model_parallel_all_gather,
         
     | 
| 
       14 
     | 
    
         
            -
            )
         
     | 
| 
       15 
     | 
    
         
            -
            from sglang.srt.layers.layernorm import RMSNorm
         
     | 
| 
       16 
     | 
    
         
            -
            from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
         
     | 
| 
       17 
8 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       18 
9 
     | 
    
         
             
            from sglang.srt.layers.pooler import Pooler, PoolingType
         
     | 
| 
       19 
10 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       20 
     | 
    
         
            -
            from sglang.srt.layers.radix_attention import RadixAttention
         
     | 
| 
       21 
     | 
    
         
            -
            from sglang.srt.layers.rotary_embedding import get_rope
         
     | 
| 
       22 
11 
     | 
    
         
             
            from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
         
     | 
| 
       23 
12 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch
         
     | 
| 
       24 
13 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import default_weight_loader
         
     | 
| 
       25 
     | 
    
         
            -
            from sglang.srt.models.qwen2 import Qwen2DecoderLayer,  
     | 
| 
      
 14 
     | 
    
         
            +
            from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model
         
     | 
| 
       26 
15 
     | 
    
         
             
            from sglang.srt.utils import add_prefix
         
     | 
| 
       27 
16 
     | 
    
         | 
| 
       28 
17 
     | 
    
         
             
            MiMoConfig = None
         
     | 
    
        sglang/srt/models/mimo_mtp.py
    CHANGED
    
    | 
         @@ -1,7 +1,6 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # Adapted from https://github.com/vllm-project/vllm/pull/17433/files  and deepseek_nextn.py
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            from  
     | 
| 
       4 
     | 
    
         
            -
            from typing import Any, Dict, Iterable, Optional, Tuple
         
     | 
| 
      
 3 
     | 
    
         
            +
            from typing import Iterable, Optional, Tuple
         
     | 
| 
       5 
4 
     | 
    
         | 
| 
       6 
5 
     | 
    
         
             
            import torch
         
     | 
| 
       7 
6 
     | 
    
         
             
            from torch import nn
         
     | 
    
        sglang/srt/models/minicpmo.py
    CHANGED
    
    | 
         @@ -43,7 +43,6 @@ from sglang.srt.managers.mm_utils import ( 
     | 
|
| 
       43 
43 
     | 
    
         
             
                general_mm_embed_routine,
         
     | 
| 
       44 
44 
     | 
    
         
             
            )
         
     | 
| 
       45 
45 
     | 
    
         
             
            from sglang.srt.managers.schedule_batch import (
         
     | 
| 
       46 
     | 
    
         
            -
                Modality,
         
     | 
| 
       47 
46 
     | 
    
         
             
                MultimodalDataItem,
         
     | 
| 
       48 
47 
     | 
    
         
             
                MultimodalInputs,
         
     | 
| 
       49 
48 
     | 
    
         
             
                flatten_nested_list,
         
     | 
| 
         @@ -59,8 +58,6 @@ from sglang.srt.utils import logger 
     | 
|
| 
       59 
58 
     | 
    
         
             
            try:
         
     | 
| 
       60 
59 
     | 
    
         
             
                from transformers import LogitsWarper
         
     | 
| 
       61 
60 
     | 
    
         
             
                from vector_quantize_pytorch import GroupedResidualFSQ
         
     | 
| 
       62 
     | 
    
         
            -
                from vocos import Vocos
         
     | 
| 
       63 
     | 
    
         
            -
                from vocos.pretrained import instantiate_class
         
     | 
| 
       64 
61 
     | 
    
         | 
| 
       65 
62 
     | 
    
         
             
                _tts_deps = True
         
     | 
| 
       66 
63 
     | 
    
         
             
            except:
         
     | 
| 
         @@ -795,8 +792,10 @@ class ConditionalChatTTS(PreTrainedModel): 
     | 
|
| 
       795 
792 
     | 
    
         
             
                    force_no_stop=False,
         
     | 
| 
       796 
793 
     | 
    
         
             
                    min_new_token=10,
         
     | 
| 
       797 
794 
     | 
    
         
             
                    max_new_token=50,
         
     | 
| 
       798 
     | 
    
         
            -
                    logits_warpers: List[LogitsWarper] =  
     | 
| 
       799 
     | 
    
         
            -
                    logits_processors:  
     | 
| 
      
 795 
     | 
    
         
            +
                    logits_warpers: Optional[List[LogitsWarper]] = None,
         
     | 
| 
      
 796 
     | 
    
         
            +
                    logits_processors: Optional[
         
     | 
| 
      
 797 
     | 
    
         
            +
                        List[CustomRepetitionPenaltyLogitsProcessorRepeat]
         
     | 
| 
      
 798 
     | 
    
         
            +
                    ] = None,
         
     | 
| 
       800 
799 
     | 
    
         
             
                    show_tqdm=False,
         
     | 
| 
       801 
800 
     | 
    
         
             
                ):
         
     | 
| 
       802 
801 
     | 
    
         
             
                    """Generate audio codes in streaming setting or non-streaming setting.
         
     | 
| 
         @@ -825,6 +824,9 @@ class ConditionalChatTTS(PreTrainedModel): 
     | 
|
| 
       825 
824 
     | 
    
         
             
                    assert input_ids.shape[0] == 1
         
     | 
| 
       826 
825 
     | 
    
         
             
                    assert past_key_values is not None
         
     | 
| 
       827 
826 
     | 
    
         | 
| 
      
 827 
     | 
    
         
            +
                    logits_warpers = logits_warpers or []
         
     | 
| 
      
 828 
     | 
    
         
            +
                    logits_processors = logits_processors or []
         
     | 
| 
      
 829 
     | 
    
         
            +
             
     | 
| 
       828 
830 
     | 
    
         
             
                    # fix: this should not be `input_ids.shape[1]`
         
     | 
| 
       829 
831 
     | 
    
         
             
                    # start_idx = input_ids.shape[1]
         
     | 
| 
       830 
832 
     | 
    
         
             
                    start_idx = (
         
     |