sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/internvl.py
CHANGED
@@ -1,9 +1,13 @@
 # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
 
+from functools import lru_cache
+
 import numpy as np
 import torch
-from decord import VideoReader, cpu
+import torchvision.transforms as T
+from decord import VideoReader, cpu, gpu
 from PIL import Image
+from torchvision.transforms import InterpolationMode
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.interns1 import InternS1ForConditionalGeneration
@@ -17,6 +21,20 @@ from sglang.srt.multimodal.processors.base_processor import (
 class InternVLImageProcessor(BaseMultimodalProcessor):
     models = [InternVLChatModel, InternS1ForConditionalGeneration]
 
+    IMAGENET_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_STD = [0.229, 0.224, 0.225]
+
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def _get_normalize_tensors(device="cuda", dtype=torch.float32):
+        mean = torch.tensor(
+            InternVLImageProcessor.IMAGENET_MEAN, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        std = torch.tensor(
+            InternVLImageProcessor.IMAGENET_STD, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        return mean, std
+
     def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
         image_size = (
@@ -48,99 +66,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
         ).build(_image_processor)
 
-    @staticmethod
-    def build_transform(input_size):
-        IMAGENET_MEAN = (0.485, 0.456, 0.406)
-        IMAGENET_STD = (0.229, 0.224, 0.225)
-
-        def resize_image(img, size):
-            return img.resize((size, size), Image.Resampling.BICUBIC)
-
-        def to_tensor(img):
-            # Convert PIL Image to numpy array
-            img_array = np.array(img).astype(np.float32) / 255.0
-            # Convert HWC to CHW format
-            img_array = img_array.transpose(2, 0, 1)
-            return torch.from_numpy(img_array)
-
-        def normalize(tensor, mean, std):
-            mean = torch.tensor(mean).view(-1, 1, 1)
-            std = torch.tensor(std).view(-1, 1, 1)
-            return (tensor - mean) / std
-
-        def transform(img):
-            img = img.convert("RGB") if img.mode != "RGB" else img
-            img = resize_image(img, input_size)
-            tensor = to_tensor(img)
-            tensor = normalize(tensor, IMAGENET_MEAN, IMAGENET_STD)
-            return tensor
-
-        return transform
-
-    @staticmethod
-    def dynamic_preprocess(
-        image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
-    ):
-
-        def find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, width, height, image_size
-        ):
-            best_ratio_diff = float("inf")
-            best_ratio = (1, 1)
-            area = width * height
-            for ratio in target_ratios:
-                target_aspect_ratio = ratio[0] / ratio[1]
-                ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-                if ratio_diff < best_ratio_diff:
-                    best_ratio_diff = ratio_diff
-                    best_ratio = ratio
-                elif ratio_diff == best_ratio_diff:
-                    if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                        best_ratio = ratio
-            return best_ratio
-
-        orig_width, orig_height = image.size
-        aspect_ratio = orig_width / orig_height
-
-        # calculate the existing image aspect ratio
-        target_ratios = set(
-            (i, j)
-            for n in range(min_num, max_num + 1)
-            for i in range(1, n + 1)
-            for j in range(1, n + 1)
-            if i * j <= max_num and i * j >= min_num
-        )
-        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-        # find the closest aspect ratio to the target
-        target_aspect_ratio = find_closest_aspect_ratio(
-            aspect_ratio, target_ratios, orig_width, orig_height, image_size
-        )
-
-        # calculate the target width and height
-        target_width = image_size * target_aspect_ratio[0]
-        target_height = image_size * target_aspect_ratio[1]
-        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-        # resize the image
-        resized_img = image.resize((target_width, target_height))
-        processed_images = []
-        for i in range(blocks):
-            box = (
-                (i % (target_width // image_size)) * image_size,
-                (i // (target_width // image_size)) * image_size,
-                ((i % (target_width // image_size)) + 1) * image_size,
-                ((i // (target_width // image_size)) + 1) * image_size,
-            )
-            # split the image
-            split_img = resized_img.crop(box)
-            processed_images.append(split_img)
-        assert len(processed_images) == blocks
-        if use_thumbnail and len(processed_images) != 1:
-            thumbnail_img = image.resize((image_size, image_size))
-            processed_images.append(thumbnail_img)
-        return processed_images
-
     @staticmethod
     def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
         if bound:
@@ -160,27 +85,110 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
 
     @staticmethod
     def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
-        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+        try:
+            vr = VideoReader(video_path, ctx=gpu(0), num_threads=1)
+            use_gpu = True
+        except (RuntimeError, OSError) as e:
+            print(
+                f"[WARNING] Load video on gpu decoding failed: {e}. Falling back to CPU."
+            )
+            vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+            use_gpu = False
+
         max_frame = len(vr) - 1
         fps = float(vr.get_avg_fps())
 
-        pixel_values_list, num_patches_list = [], []
-        transform = InternVLImageProcessor.build_transform(input_size=input_size)
+        pixel_values_list = []
+        num_patches_list = []
         frame_indices = InternVLImageProcessor.get_index(
             bound, fps, max_frame, first_idx=0, num_segments=num_segments
         )
+
+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         for frame_index in frame_indices:
-            img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
-            img = InternVLImageProcessor.dynamic_preprocess(
-                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            # Load frame
+            frame = vr[frame_index]
+            if use_gpu:
+                img = frame.cuda().permute(2, 0, 1).float() / 255.0
+            else:
+                img_np = frame.asnumpy()
+                img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
+
+            img = (img - mean) / std
+
+            tiles = InternVLImageProcessor.dynamic_preprocess(
+                img, image_size=input_size, max_num=max_num, use_thumbnail=True
             )
-            pixel_values = [transform(tile) for tile in img]
-            pixel_values = torch.stack(pixel_values)
-            num_patches_list.append(pixel_values.shape[0])
-            pixel_values_list.append(pixel_values)
-        pixel_values = torch.cat(pixel_values_list)
+
+            pixel_values_list.append(tiles)
+            num_patches_list.append(tiles.shape[0])
+
+        pixel_values = torch.cat(pixel_values_list, dim=0)
         return pixel_values, num_patches_list
 
+    @staticmethod
+    def dynamic_preprocess(tensor, image_size=448, max_num=12, use_thumbnail=False):
+        C, H, W = tensor.shape
+        aspect_ratio = W / H
+
+        # Generate all possible aspect ratios
+        target_ratios = set(
+            (i, j)
+            for n in range(1, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        # Find closest ratio
+        best_ratio_diff = float("inf")
+        best_ratio = (1, 1)
+
+        for x, y in target_ratios:
+            target_ar = x / y
+            diff = abs(aspect_ratio - target_ar)
+            blocks = x * y
+            best_blocks = best_ratio[0] * best_ratio[1]
+
+            if diff < best_ratio_diff:
+                best_ratio_diff = diff
+                best_ratio = (x, y)
+            elif diff == best_ratio_diff and blocks > best_blocks:
+                best_ratio = (x, y)
+
+        target_w, target_h = image_size * best_ratio[0], image_size * best_ratio[1]
+        blocks = best_ratio[0] * best_ratio[1]
+
+        # Resize on GPU
+        resized = torch.nn.functional.interpolate(
+            tensor.unsqueeze(0),
+            size=(target_h, target_w),
+            mode="bicubic",
+            align_corners=False,
+        ).squeeze(0)
+
+        # Split into tiles
+        tiles = []
+        for i in range(blocks):
+            x = (i % best_ratio[0]) * image_size
+            y = (i // best_ratio[0]) * image_size
+            tile = resized[:, y : y + image_size, x : x + image_size]
+            tiles.append(tile)
+
+        # Add thumbnail if needed
+        if use_thumbnail and len(tiles) > 1:
+            thumb = torch.nn.functional.interpolate(
+                tensor.unsqueeze(0),
+                size=(image_size, image_size),
+                mode="bicubic",
+                align_corners=False,
+            ).squeeze(0)
+            tiles.append(thumb)
+
+        return torch.stack(tiles).to(torch.bfloat16)
+
     async def process_mm_data_async(
         self, image_data, input_text, request_obj, **kwargs
     ):
@@ -191,53 +199,69 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             discard_alpha_channel=True,
         )
 
-        def process_image_internvl(image, input_size=448, max_num=12):
-            transform = InternVLImageProcessor.build_transform(input_size=input_size)
-            images = InternVLImageProcessor.dynamic_preprocess(
-                image, image_size=input_size, use_thumbnail=True, max_num=max_num
-            )
-            pixel_values = [transform(image) for image in images]
-            pixel_values = torch.stack(pixel_values)
-            return pixel_values
-
         num_patches_list = []
         pixel_values = []
+
+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         # Process each input with allocated frames
-        for image_index, (image) in enumerate(base_output.images):
+        for image_index, image in enumerate(base_output.images):
             try:
                 # TODO: video input
-                raw_image = process_image_internvl(image)
-                pixel_value = raw_image.to(torch.bfloat16).cuda()
-                pixel_values.append(pixel_value)
-                num_patches = raw_image.shape[0]
-                num_patches_list.append(num_patches)
-
-            except Exception as e:
-                print(f"Failed to process image: {e}")
+                # Convert PIL to GPU tensor
+                if isinstance(image, Image.Image):
+                    img_np = np.array(image.convert("RGB"))
+                    tensor = (
+                        torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
+                    )
+                else:
+                    tensor = image.cuda()  # assume already tensor
+
+                tensor = (tensor - mean) / std
+                tiles = self.dynamic_preprocess(
+                    tensor, image_size=448, max_num=12, use_thumbnail=True
+                )
+
+                pixel_values.append(tiles)
+                num_patches_list.append(tiles.shape[0])
+
+            except Exception as e:
+                print(f"[Error] Failed to process image {image_index}: {e}")
                 return None
 
+        # Concatenate all
         pixel_values = torch.cat(pixel_values, dim=0)
 
         original_placeholder = "<<<__IMG_CONTEXT_PLACEHOLDER__>>>"
         input_text = input_text.replace(self.IMG_CONTEXT_TOKEN, original_placeholder)
 
-        for idx, num_patches in enumerate(num_patches_list):
+        input_text_updated = input_text
+        for num_patches in num_patches_list:
             image_tokens = (
                 self.IMG_START_TOKEN
                 + self.IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
                 + self.IMG_END_TOKEN
             )
-            input_text = input_text.replace(original_placeholder, image_tokens, 1)
+            input_text_updated = input_text_updated.replace(
+                original_placeholder, image_tokens, 1
+            )
 
-        input_text = input_text.replace(original_placeholder, self.IMG_CONTEXT_TOKEN)
+        input_text_updated = input_text_updated.replace(
+            original_placeholder, self.IMG_CONTEXT_TOKEN
+        )
 
-        input_ids = self.tokenizer(input_text, return_tensors="pt")[
+        # Tokenize
+        input_ids_tensor = self.tokenizer(input_text_updated, return_tensors="pt")[
             "input_ids"
         ].flatten()
+        input_ids = input_ids_tensor.tolist()
+
+        # Get image token offsets
         image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
+            input_ids=input_ids_tensor.to("cuda"),
             mm_token_id=self.mm_tokens.image_token_id,
         )
+
         items = [
             MultimodalDataItem(
                 feature=pixel_values,
@@ -247,7 +271,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         ]
 
         return {
-            "input_ids": input_ids.tolist(),
+            "input_ids": input_ids,
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
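
Note: taken together, the internvl.py hunks move preprocessing from per-tile PIL/CPU transforms to a single GPU pipeline; normalization constants are built once via the lru_cache'd _get_normalize_tensors, and dynamic_preprocess now tiles a CHW tensor with torch.nn.functional.interpolate instead of PIL crops. A minimal usage sketch of the new path (the input filename is hypothetical, and a CUDA device is assumed):

import numpy as np
import torch
from PIL import Image

from sglang.srt.multimodal.processors.internvl import InternVLImageProcessor

image = Image.open("example.jpg").convert("RGB")  # hypothetical input

# HWC uint8 -> CHW float on the GPU, then normalize with the cached mean/std
tensor = torch.from_numpy(np.array(image)).permute(2, 0, 1).cuda().float() / 255.0
mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
tensor = (tensor - mean) / std

# Tile into at most 12 crops of 448x448 plus a thumbnail, returned as bfloat16
tiles = InternVLImageProcessor.dynamic_preprocess(
    tensor, image_size=448, max_num=12, use_thumbnail=True
)
print(tiles.shape)  # (num_tiles, 3, 448, 448)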
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -12,6 +12,8 @@ from torchvision.transforms import InterpolationMode
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
+from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
@@ -67,10 +69,15 @@ def smart_resize(
     return h_bar, w_bar
 
 
-def resize_image(image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+def resize_image(
+    image,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+    size_factor: int = IMAGE_FACTOR,
+) -> Image.Image:
     width, height = image.size
-    min_pixels = MIN_PIXELS
-    max_pixels = MAX_PIXELS
+    min_pixels = min_pixels
+    max_pixels = max_pixels
     resized_height, resized_width = smart_resize(
         height,
         width,
@@ -97,8 +104,13 @@ def floor_by_factor(number: int, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
-async def resize_image_async(image):
-    return resize_image(image)
+async def resize_image_async(
+    image,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+    size_factor: int = IMAGE_FACTOR,
+):
+    return resize_image(image, min_pixels, max_pixels, size_factor)
 
 
 def smart_nframes(
@@ -199,7 +211,12 @@ async def preprocess_video(
 
 # Compatible with Qwen2VL and Qwen2_5VL
 class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
-    models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
+    models = [
+        Qwen2VLForConditionalGeneration,
+        Qwen2_5_VLForConditionalGeneration,
+        Qwen3VLForConditionalGeneration,
+        Qwen3VLMoeForConditionalGeneration,
+    ]
 
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
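
Note: resize_image and resize_image_async previously pinned the pixel budget to the module-level MIN_PIXELS/MAX_PIXELS constants; they now take the budget per call. A short sketch (the pixel values below are illustrative, not defaults taken from this diff):

from PIL import Image

from sglang.srt.multimodal.processors.qwen_vl import resize_image

image = Image.open("example.jpg")  # hypothetical input
# Cap the resized area at roughly 1024 vision patches of 28x28 pixels
resized = resize_image(image, min_pixels=256 * 28 * 28, max_pixels=1024 * 28 * 28)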
sglang/srt/multimodal/processors/sarashina2_vision.py
ADDED
@@ -0,0 +1,81 @@
+from typing import List, Union
+
+from sglang.srt.models.sarashina2_vision import Sarashina2VisionForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class Sarashina2VisionProcessor(BaseMultimodalProcessor):
+    models = [Sarashina2VisionForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        # Sarashina2Vision specific tokens (default is <|file|>)
+        self.IMAGE_TOKEN = "<|file|>"
+        self.IM_TOKEN_ID = getattr(hf_config, "image_token_index", 14)
+        self.IM_START_ID = getattr(hf_config, "start_image_token_index", 102397)
+        self.IM_END_ID = getattr(hf_config, "end_image_token_index", 102398)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+
+        # Patch the processor's image processor to handle parameter compatibility
+        if hasattr(_processor, "image_processor") and hasattr(
+            _processor.image_processor, "_preprocess"
+        ):
+            original_preprocess = _processor.image_processor._preprocess
+
+            def patched_preprocess(*args, **kwargs):
+                # Filter kwargs to only include parameters that the custom _preprocess method accepts
+                # Based on Sarashina2VisionImageProcessor._preprocess signature
+                allowed_params = {
+                    "do_resize",
+                    "resample",
+                    "do_rescale",
+                    "rescale_factor",
+                    "do_normalize",
+                    "image_mean",
+                    "image_std",
+                    "do_convert_rgb",
+                    "data_format",
+                    "input_data_format",
+                }
+                filtered_kwargs = {
+                    k: v for k, v in kwargs.items() if k in allowed_params
+                }
+                return original_preprocess(*args, **filtered_kwargs)
+
+            _processor.image_processor._preprocess = patched_preprocess
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        """Process image data for Sarashina2Vision model using standard SGLang pattern."""
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output=base_output,
+            mm_tokens=self.mm_tokens,
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+            "im_start_id": self.IM_START_ID,
+            "im_end_id": self.IM_END_ID,
+        }
sglang/srt/offloader.py
CHANGED
@@ -38,6 +38,10 @@ class BaseOffloader(ABC):
     def post_init(self):
         pass
 
+    @property
+    def forbid_copy_engine_usage(self):
+        return False
+
 
 class NoopOffloader(BaseOffloader):
     pass
@@ -233,6 +237,10 @@ class OffloaderV2(BaseOffloader):
         for i in range(self.prefetch_step):
            self.offloaders[i].start_onload()
 
+    @property
+    def forbid_copy_engine_usage(self):
+        return self.mode == "cpu"
+
 
 def _hook_module_forward_for_offloader(index, module, offloaders, prefetch_step):
     def _on_forward_end():
@@ -398,14 +406,30 @@ class _ShmCpuParamOffloader(_BaseParamOffloader):
         return self.shm_cpu_data.to("cuda", non_blocking=True)
 
 
+def update_param(param, new_tensor):
+    """Update parameter while keeping properties needed by Offloader (e.g. pinned host memory)."""
+
+    if param.device == new_tensor.device:
+        param.data = new_tensor
+    else:
+        assert param.device == torch.device(
+            "cpu"
+        ), f"{param.device=} {new_tensor.device=}"
+        param.data = _create_cpu_data(new_tensor, pin_memory=True)
+
+
 def _move_param_to_cpu(param, pin_memory: bool):
+    param.data = _create_cpu_data(param.data, pin_memory=pin_memory)
+
+
+def _create_cpu_data(data, pin_memory: bool):
     cpu_data = _empty_strided_like(
-        param.data,
+        data,
         device="cpu",
         pin_memory=pin_memory,
     )
-    cpu_data.copy_(param.data)
-    param.data = cpu_data
+    cpu_data.copy_(data)
+    return cpu_data
 
 
 def _move_param_to_meta(module, param_name):
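
Note: routing all host-side copies through _create_cpu_data and the new update_param preserves page-locked (pinned) host memory when a parameter is swapped, since non_blocking host-to-device copies only overlap with compute when the source is pinned. A rough stand-alone illustration in plain PyTorch:

import torch

t = torch.randn(1024, 1024)                      # pageable host memory
t_pinned = torch.empty_like(t, pin_memory=True)  # page-locked host memory
t_pinned.copy_(t)

# This copy can overlap with GPU compute; a copy from pageable memory cannot.
gpu_tensor = t_pinned.to("cuda", non_blocking=True)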
sglang/srt/parser/jinja_template_utils.py
CHANGED
@@ -89,6 +89,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
     - If template has loops like {%- for content in message['content'] -%} → 'openai'
     - Otherwise → 'string'
     """
+    # Shortcut for multimodal templates
+    if any(
+        keyword in chat_template for keyword in ["image", "audio", "video", "vision"]
+    ):
+        return "openai"
+
     jinja_ast = _try_extract_ast(chat_template)
     if jinja_ast is None:
         return "string"
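
Note: the shortcut classifies any template that mentions a multimodal keyword as 'openai' before the AST inspection runs, so multimodal chat templates get structured content parts even when their Jinja loops are unusual. For example:

template = "{% for m in messages %}{% if m.image %}...{% endif %}{{ m['content'] }}{% endfor %}"
assert detect_jinja_template_content_format(template) == "openai"  # contains "image"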
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -60,6 +60,9 @@ class SamplingBatchInfo:
         Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]
     ] = None
 
+    # Used for deterministic sampling
+    sampling_seed: Optional[torch.Tensor] = None
+
     # Device
     device: str = "cuda"
 
@@ -67,28 +70,41 @@ class SamplingBatchInfo:
     logit_bias: Optional[torch.Tensor] = None
 
     @classmethod
-    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+    def _get_global_server_args_dict(cls):
         from sglang.srt.managers.schedule_batch import global_server_args_dict
 
+        return global_server_args_dict
+
+    @classmethod
+    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+        global_server_args_dict = cls._get_global_server_args_dict()
+        enable_deterministic = global_server_args_dict["enable_deterministic_inference"]
+
         reqs = batch.reqs
         device = batch.device
-        temperatures = (
-            torch.tensor(
-                [r.sampling_params.temperature for r in reqs],
-                dtype=torch.float,
-            )
-            .view(-1, 1)
-            .to(device, non_blocking=True)
-        )
+        temperatures = torch.tensor(
+            [r.sampling_params.temperature for r in reqs],
+            dtype=torch.float,
+            device=device,
+        ).view(-1, 1)
         top_ps = torch.tensor(
-            [r.sampling_params.top_p for r in reqs], dtype=torch.float
-        )
+            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+        )
         top_ks = torch.tensor(
-            [r.sampling_params.top_k for r in reqs], dtype=torch.int32
-        )
+            [r.sampling_params.top_k for r in reqs], dtype=torch.int32, device=device
+        )
         min_ps = torch.tensor(
-            [r.sampling_params.min_p for r in reqs], dtype=torch.float
-        )
+            [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+        )
+        sampling_seed = (
+            torch.tensor(
+                [r.sampling_params.sampling_seed for r in reqs],
+                dtype=torch.int32,
+                device=device,
+            )
+            if enable_deterministic
+            else None
+        )
 
         logit_bias = None
         if any(r.sampling_params.logit_bias is not None for r in reqs):
@@ -154,6 +170,7 @@ class SamplingBatchInfo:
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
+            sampling_seed=sampling_seed,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
             need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
@@ -235,9 +252,11 @@ class SamplingBatchInfo:
             "top_ps",
             "top_ks",
             "min_ps",
+            "sampling_seed",
         ]:
             value = getattr(self, item, None)
-            setattr(self, item, value[keep_indices_device])
+            if value is not None:
+                setattr(self, item, value[keep_indices_device])
 
         if self.logit_bias is not None:
             self.logit_bias = self.logit_bias[keep_indices_device]
@@ -339,10 +358,12 @@ class SamplingBatchInfo:
             "top_ps",
             "top_ks",
             "min_ps",
+            "sampling_seed",
         ]:
             self_val = getattr(self, item, None)
             other_val = getattr(other, item, None)
-            setattr(self, item, torch.cat([self_val, other_val]))
+            if self_val is not None and other_val is not None:
+                setattr(self, item, torch.cat([self_val, other_val]))
 
         self.is_all_greedy &= other.is_all_greedy
         self.need_top_p_sampling |= other.need_top_p_sampling
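
Note: with enable_deterministic_inference set, every request's sampling_seed is batched into an int32 tensor that is filtered and concatenated alongside top_p/top_k/min_p, so a request keeps its seed no matter how batches are split or merged. A minimal sketch of per-row seeded sampling under that assumption (the real kernel lives in sglang's sampler; this only illustrates the idea):

import torch

def sample_per_row(probs: torch.Tensor, seeds: torch.Tensor) -> torch.Tensor:
    # One independent generator per request row, seeded from sampling_seed,
    # so the drawn token is reproducible regardless of batch composition.
    samples = []
    for row, seed in zip(probs, seeds.tolist()):
        g = torch.Generator(device=row.device)
        g.manual_seed(seed)
        samples.append(torch.multinomial(row, num_samples=1, generator=g))
    return torch.cat(samples)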