sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
 - sglang/bench_one_batch_server.py +41 -25
 - sglang/bench_serving.py +378 -160
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +10 -15
 - sglang/profiler.py +18 -1
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +4 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +13 -64
 - sglang/srt/configs/load_config.py +25 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +136 -25
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +0 -10
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +5 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +20 -11
 - sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +4 -2
 - sglang/srt/disaggregation/decode.py +123 -31
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +157 -19
 - sglang/srt/disaggregation/nixl/conn.py +69 -24
 - sglang/srt/disaggregation/prefill.py +96 -270
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +63 -19
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +83 -80
 - sglang/srt/entrypoints/grpc_server.py +430 -234
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +195 -102
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +225 -37
 - sglang/srt/entrypoints/openai/serving_base.py +49 -2
 - sglang/srt/entrypoints/openai/serving_chat.py +29 -74
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +15 -1
 - sglang/srt/entrypoints/openai/serving_responses.py +5 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +58 -6
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +33 -4
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +20 -14
 - sglang/srt/function_call/glm4_moe_detector.py +1 -5
 - sglang/srt/function_call/gpt_oss_detector.py +1 -1
 - sglang/srt/function_call/json_array_parser.py +0 -2
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/function_call/utils.py +2 -2
 - sglang/srt/grpc/compile_proto.py +3 -3
 - sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
 - sglang/srt/layers/activation.py +10 -1
 - sglang/srt/layers/attention/aiter_backend.py +3 -3
 - sglang/srt/layers/attention/ascend_backend.py +17 -1
 - sglang/srt/layers/attention/attention_registry.py +43 -23
 - sglang/srt/layers/attention/base_attn_backend.py +20 -1
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +24 -10
 - sglang/srt/layers/attention/flashinfer_backend.py +258 -22
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
 - sglang/srt/layers/attention/flashmla_backend.py +2 -2
 - sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
 - sglang/srt/layers/attention/mamba/mamba.py +189 -241
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +0 -1
 - sglang/srt/layers/attention/nsa_backend.py +404 -90
 - sglang/srt/layers/attention/triton_backend.py +208 -34
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
 - sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
 - sglang/srt/layers/attention/utils.py +89 -7
 - sglang/srt/layers/attention/vision.py +3 -3
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +12 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +17 -0
 - sglang/srt/layers/layernorm.py +64 -19
 - sglang/srt/layers/linear.py +9 -1
 - sglang/srt/layers/logits_processor.py +152 -17
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
 - sglang/srt/layers/moe/ep_moe/layer.py +154 -625
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +6 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +7 -6
 - sglang/srt/layers/moe/utils.py +20 -5
 - sglang/srt/layers/quantization/__init__.py +5 -58
 - sglang/srt/layers/quantization/awq.py +183 -9
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +27 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +152 -81
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +42 -14
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +125 -100
 - sglang/srt/layers/quantization/mxfp4.py +35 -68
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +23 -48
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +87 -20
 - sglang/srt/layers/quantization/w8a8_int8.py +30 -24
 - sglang/srt/layers/radix_attention.py +62 -9
 - sglang/srt/layers/rotary_embedding.py +686 -17
 - sglang/srt/layers/sampler.py +47 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +0 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/triton_backend.py +0 -1
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora_manager.py +24 -9
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +40 -16
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
 - sglang/srt/managers/cache_controller.py +48 -17
 - sglang/srt/managers/data_parallel_controller.py +146 -42
 - sglang/srt/managers/detokenizer_manager.py +40 -13
 - sglang/srt/managers/io_struct.py +69 -16
 - sglang/srt/managers/mm_utils.py +20 -18
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
 - sglang/srt/managers/overlap_utils.py +96 -19
 - sglang/srt/managers/schedule_batch.py +241 -511
 - sglang/srt/managers/schedule_policy.py +15 -2
 - sglang/srt/managers/scheduler.py +420 -514
 - sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
 - sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
 - sglang/srt/managers/tokenizer_manager.py +375 -95
 - sglang/srt/managers/tp_worker.py +212 -161
 - sglang/srt/managers/utils.py +78 -2
 - sglang/srt/mem_cache/allocator.py +7 -2
 - sglang/srt/mem_cache/allocator_ascend.py +2 -2
 - sglang/srt/mem_cache/base_prefix_cache.py +2 -2
 - sglang/srt/mem_cache/chunk_cache.py +13 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +16 -1
 - sglang/srt/mem_cache/hicache_storage.py +11 -2
 - sglang/srt/mem_cache/hiradix_cache.py +16 -3
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +517 -219
 - sglang/srt/mem_cache/memory_pool_host.py +0 -1
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +53 -19
 - sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
 - sglang/srt/mem_cache/storage/backend_factory.py +2 -2
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +92 -26
 - sglang/srt/metrics/collector.py +31 -0
 - sglang/srt/metrics/func_timer.py +1 -1
 - sglang/srt/model_executor/cuda_graph_runner.py +43 -5
 - sglang/srt/model_executor/forward_batch_info.py +71 -25
 - sglang/srt/model_executor/model_runner.py +362 -270
 - sglang/srt/model_executor/npu_graph_runner.py +2 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +424 -27
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +47 -28
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +13 -52
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +19 -3
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +418 -140
 - sglang/srt/models/dots_ocr.py +0 -2
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +13 -19
 - sglang/srt/models/gemma3_mm.py +16 -0
 - sglang/srt/models/gemma3n_mm.py +1 -2
 - sglang/srt/models/glm4_moe.py +327 -382
 - sglang/srt/models/glm4_moe_nextn.py +6 -16
 - sglang/srt/models/glm4v.py +2 -1
 - sglang/srt/models/glm4v_moe.py +32 -199
 - sglang/srt/models/gpt_oss.py +5 -5
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +3 -1
 - sglang/srt/models/llama.py +2 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +5 -22
 - sglang/srt/models/longcat_flash_nextn.py +3 -14
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +13 -3
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen2_5_vl.py +3 -3
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +15 -12
 - sglang/srt/models/qwen2_vl.py +5 -2
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +19 -37
 - sglang/srt/models/qwen3_next.py +7 -12
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +37 -33
 - sglang/srt/models/qwen3_vl_moe.py +57 -185
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +0 -1
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/utils.py +11 -1
 - sglang/srt/multimodal/processors/base_processor.py +7 -2
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +0 -1
 - sglang/srt/multimodal/processors/glm4v.py +2 -6
 - sglang/srt/multimodal/processors/internvl.py +0 -2
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +75 -16
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/reasoning_parser.py +28 -2
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +17 -22
 - sglang/srt/sampling/sampling_params.py +70 -2
 - sglang/srt/server_args.py +846 -163
 - sglang/srt/server_args_config_parser.py +1 -1
 - sglang/srt/single_batch_overlap.py +36 -31
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
 - sglang/srt/speculative/eagle_info.py +57 -18
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +138 -0
 - sglang/srt/speculative/eagle_worker.py +83 -280
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
 - sglang/srt/speculative/ngram_worker.py +12 -11
 - sglang/srt/speculative/spec_info.py +2 -0
 - sglang/srt/speculative/spec_utils.py +38 -3
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/two_batch_overlap.py +28 -14
 - sglang/srt/utils/__init__.py +1 -1
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/utils/common.py +272 -82
 - sglang/srt/utils/hf_transformers_utils.py +44 -17
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +41 -0
 - sglang/test/runners.py +2 -0
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +3 -0
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/test_block_fp8.py +1 -2
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +463 -107
 - sglang/test/test_deterministic_utils.py +74 -0
 - sglang/test/test_disaggregation_utils.py +81 -0
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_utils.py +85 -20
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
 - sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
 - sglang/srt/models/vila.py +0 -306
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
    
        sglang/srt/utils/common.py
    CHANGED
    
    | 
         @@ -12,7 +12,6 @@ 
     | 
|
| 
       12 
12 
     | 
    
         
             
            # limitations under the License.
         
     | 
| 
       13 
13 
     | 
    
         
             
            # ==============================================================================
         
     | 
| 
       14 
14 
     | 
    
         
             
            """Common utilities."""
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
15 
     | 
    
         
             
            from __future__ import annotations
         
     | 
| 
       17 
16 
     | 
    
         | 
| 
       18 
17 
     | 
    
         
             
            import argparse
         
     | 
| 
         @@ -43,6 +42,7 @@ import tempfile 
     | 
|
| 
       43 
42 
     | 
    
         
             
            import threading
         
     | 
| 
       44 
43 
     | 
    
         
             
            import time
         
     | 
| 
       45 
44 
     | 
    
         
             
            import traceback
         
     | 
| 
      
 45 
     | 
    
         
            +
            import types
         
     | 
| 
       46 
46 
     | 
    
         
             
            import uuid
         
     | 
| 
       47 
47 
     | 
    
         
             
            import warnings
         
     | 
| 
       48 
48 
     | 
    
         
             
            from collections import OrderedDict, defaultdict
         
     | 
| 
         @@ -63,6 +63,7 @@ from typing import ( 
     | 
|
| 
       63 
63 
     | 
    
         
             
                List,
         
     | 
| 
       64 
64 
     | 
    
         
             
                Optional,
         
     | 
| 
       65 
65 
     | 
    
         
             
                Protocol,
         
     | 
| 
      
 66 
     | 
    
         
            +
                Sequence,
         
     | 
| 
       66 
67 
     | 
    
         
             
                Set,
         
     | 
| 
       67 
68 
     | 
    
         
             
                Tuple,
         
     | 
| 
       68 
69 
     | 
    
         
             
                TypeVar,
         
     | 
| 
         @@ -70,6 +71,7 @@ from typing import ( 
     | 
|
| 
       70 
71 
     | 
    
         
             
            )
         
     | 
| 
       71 
72 
     | 
    
         | 
| 
       72 
73 
     | 
    
         
             
            import numpy as np
         
     | 
| 
      
 74 
     | 
    
         
            +
            import orjson
         
     | 
| 
       73 
75 
     | 
    
         
             
            import psutil
         
     | 
| 
       74 
76 
     | 
    
         
             
            import pybase64
         
     | 
| 
       75 
77 
     | 
    
         
             
            import requests
         
     | 
| 
         @@ -88,6 +90,7 @@ from torch.profiler import ProfilerActivity, profile, record_function 
     | 
|
| 
       88 
90 
     | 
    
         
             
            from torch.utils._contextlib import _DecoratorContextManager
         
     | 
| 
       89 
91 
     | 
    
         
             
            from typing_extensions import Literal
         
     | 
| 
       90 
92 
     | 
    
         | 
| 
      
 93 
     | 
    
         
            +
            from sglang.srt.environ import envs
         
     | 
| 
       91 
94 
     | 
    
         
             
            from sglang.srt.metrics.func_timer import enable_func_timer
         
     | 
| 
       92 
95 
     | 
    
         | 
| 
       93 
96 
     | 
    
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
         @@ -131,6 +134,7 @@ def is_xpu() -> bool: 
     | 
|
| 
       131 
134 
     | 
    
         
             
                return hasattr(torch, "xpu") and torch.xpu.is_available()
         
     | 
| 
       132 
135 
     | 
    
         | 
| 
       133 
136 
     | 
    
         | 
| 
      
 137 
     | 
    
         
            +
            @lru_cache(maxsize=1)
         
     | 
| 
       134 
138 
     | 
    
         
             
            def is_npu() -> bool:
         
     | 
| 
       135 
139 
     | 
    
         
             
                return hasattr(torch, "npu") and torch.npu.is_available()
         
     | 
| 
       136 
140 
     | 
    
         | 
| 
         @@ -162,6 +166,20 @@ def _check(cc_major): 
     | 
|
| 
       162 
166 
     | 
    
         
             
                ) >= (12, 3)
         
     | 
| 
       163 
167 
     | 
    
         | 
| 
       164 
168 
     | 
    
         | 
| 
      
 169 
     | 
    
         
            +
            @contextmanager
         
     | 
| 
      
 170 
     | 
    
         
            +
            def device_context(device: torch.device):
         
     | 
| 
      
 171 
     | 
    
         
            +
                if device.type == "cpu" and is_cpu():
         
     | 
| 
      
 172 
     | 
    
         
            +
                    with torch.device("cpu"):
         
     | 
| 
      
 173 
     | 
    
         
            +
                        yield
         
     | 
| 
      
 174 
     | 
    
         
            +
                else:
         
     | 
| 
      
 175 
     | 
    
         
            +
                    module = torch.get_device_module(device)
         
     | 
| 
      
 176 
     | 
    
         
            +
                    if module is not None:
         
     | 
| 
      
 177 
     | 
    
         
            +
                        with module.device(device.index):
         
     | 
| 
      
 178 
     | 
    
         
            +
                            yield
         
     | 
| 
      
 179 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 180 
     | 
    
         
            +
                        raise ValueError(f"Unknown device module: {device}")
         
     | 
| 
      
 181 
     | 
    
         
            +
             
     | 
| 
      
 182 
     | 
    
         
            +
             
     | 
| 
       165 
183 
     | 
    
         
             
            is_ampere_with_cuda_12_3 = lambda: _check(8)
         
     | 
| 
       166 
184 
     | 
    
         
             
            is_hopper_with_cuda_12_3 = lambda: _check(9)
         
     | 
| 
       167 
185 
     | 
    
         | 
| 
         @@ -173,6 +191,15 @@ def is_blackwell(): 
     | 
|
| 
       173 
191 
     | 
    
         
             
                return torch.cuda.get_device_capability()[0] == 10
         
     | 
| 
       174 
192 
     | 
    
         | 
| 
       175 
193 
     | 
    
         | 
| 
      
 194 
     | 
    
         
            +
            @lru_cache(maxsize=1)
         
     | 
| 
      
 195 
     | 
    
         
            +
            def is_sm120_supported(device=None) -> bool:
         
     | 
| 
      
 196 
     | 
    
         
            +
                if not is_cuda_alike():
         
     | 
| 
      
 197 
     | 
    
         
            +
                    return False
         
     | 
| 
      
 198 
     | 
    
         
            +
                return (torch.cuda.get_device_capability(device)[0] == 12) and (
         
     | 
| 
      
 199 
     | 
    
         
            +
                    torch.version.cuda >= "12.8"
         
     | 
| 
      
 200 
     | 
    
         
            +
                )
         
     | 
| 
      
 201 
     | 
    
         
            +
             
     | 
| 
      
 202 
     | 
    
         
            +
             
     | 
| 
       176 
203 
     | 
    
         
             
            @lru_cache(maxsize=1)
         
     | 
| 
       177 
204 
     | 
    
         
             
            def is_sm100_supported(device=None) -> bool:
         
     | 
| 
       178 
205 
     | 
    
         
             
                if not is_cuda_alike():
         
     | 
| 
         @@ -228,7 +255,7 @@ def support_triton(backend: str) -> bool: 
     | 
|
| 
       228 
255 
     | 
    
         | 
| 
       229 
256 
     | 
    
         | 
| 
       230 
257 
     | 
    
         
             
            try:
         
     | 
| 
       231 
     | 
    
         
            -
                import sgl_kernel
         
     | 
| 
      
 258 
     | 
    
         
            +
                import sgl_kernel  # noqa: F401
         
     | 
| 
       232 
259 
     | 
    
         | 
| 
       233 
260 
     | 
    
         
             
                is_intel_amx_backend_available = hasattr(
         
     | 
| 
       234 
261 
     | 
    
         
             
                    torch.ops.sgl_kernel, "convert_weight_packed"
         
     | 
| 
         @@ -253,6 +280,14 @@ def use_intel_amx_backend(layer): 
     | 
|
| 
       253 
280 
     | 
    
         
             
                return getattr(layer, "use_intel_amx_backend", False)
         
     | 
| 
       254 
281 
     | 
    
         | 
| 
       255 
282 
     | 
    
         | 
| 
      
 283 
     | 
    
         
            +
            def xpu_has_xmx_support():
         
     | 
| 
      
 284 
     | 
    
         
            +
                # TODO: update with XPU capability query
         
     | 
| 
      
 285 
     | 
    
         
            +
                if is_xpu():
         
     | 
| 
      
 286 
     | 
    
         
            +
                    # currently only PVC/LNL/BMG support FP64, so we only support these for now
         
     | 
| 
      
 287 
     | 
    
         
            +
                    return torch.xpu.get_device_properties().has_fp64
         
     | 
| 
      
 288 
     | 
    
         
            +
                return False
         
     | 
| 
      
 289 
     | 
    
         
            +
             
     | 
| 
      
 290 
     | 
    
         
            +
             
     | 
| 
       256 
291 
     | 
    
         
             
            def is_flashinfer_available():
         
     | 
| 
       257 
292 
     | 
    
         
             
                """
         
     | 
| 
       258 
293 
     | 
    
         
             
                Check whether flashinfer is available.
         
     | 
| 
         @@ -263,6 +298,17 @@ def is_flashinfer_available(): 
     | 
|
| 
       263 
298 
     | 
    
         
             
                return importlib.util.find_spec("flashinfer") is not None and is_cuda()
         
     | 
| 
       264 
299 
     | 
    
         | 
| 
       265 
300 
     | 
    
         | 
| 
      
 301 
     | 
    
         
            +
            def is_nvidia_cublas_cu12_version_ge_12_9():
         
     | 
| 
      
 302 
     | 
    
         
            +
                """
         
     | 
| 
      
 303 
     | 
    
         
            +
                temporary fix for issue #11272
         
     | 
| 
      
 304 
     | 
    
         
            +
                """
         
     | 
| 
      
 305 
     | 
    
         
            +
                try:
         
     | 
| 
      
 306 
     | 
    
         
            +
                    installed_version = version("nvidia-cublas-cu12")
         
     | 
| 
      
 307 
     | 
    
         
            +
                except PackageNotFoundError:
         
     | 
| 
      
 308 
     | 
    
         
            +
                    return False
         
     | 
| 
      
 309 
     | 
    
         
            +
                return pkg_version.parse(installed_version) >= pkg_version.parse("12.9")
         
     | 
| 
      
 310 
     | 
    
         
            +
             
     | 
| 
      
 311 
     | 
    
         
            +
             
     | 
| 
       266 
312 
     | 
    
         
             
            def random_uuid() -> str:
         
     | 
| 
       267 
313 
     | 
    
         
             
                return str(uuid.uuid4().hex)
         
     | 
| 
       268 
314 
     | 
    
         | 
| 
         @@ -409,7 +455,15 @@ def get_available_gpu_memory( 
     | 
|
| 
       409 
455 
     | 
    
         | 
| 
       410 
456 
     | 
    
         
             
                    if empty_cache:
         
     | 
| 
       411 
457 
     | 
    
         
             
                        torch.cuda.empty_cache()
         
     | 
| 
       412 
     | 
    
         
            -
                     
     | 
| 
      
 458 
     | 
    
         
            +
                    SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark
         
     | 
| 
      
 459 
     | 
    
         
            +
                    if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
         
     | 
| 
      
 460 
     | 
    
         
            +
                        # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
         
     | 
| 
      
 461 
     | 
    
         
            +
                        # only reports "free" memory, which can be lower than what is actually
         
     | 
| 
      
 462 
     | 
    
         
            +
                        # available due to not including cache memory. So we use the system available
         
     | 
| 
      
 463 
     | 
    
         
            +
                        # memory metric instead.
         
     | 
| 
      
 464 
     | 
    
         
            +
                        free_gpu_memory = psutil.virtual_memory().available
         
     | 
| 
      
 465 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 466 
     | 
    
         
            +
                        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
         
     | 
| 
       413 
467 
     | 
    
         | 
| 
       414 
468 
     | 
    
         
             
                elif device == "xpu":
         
     | 
| 
       415 
469 
     | 
    
         
             
                    num_gpus = torch.xpu.device_count()
         
     | 
| 
         @@ -453,6 +507,8 @@ def get_available_gpu_memory( 
     | 
|
| 
       453 
507 
     | 
    
         
             
                            f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
         
     | 
| 
       454 
508 
     | 
    
         
             
                            "which may cause useless memory allocation for torch NPU context.",
         
     | 
| 
       455 
509 
     | 
    
         
             
                        )
         
     | 
| 
      
 510 
     | 
    
         
            +
                    if empty_cache:
         
     | 
| 
      
 511 
     | 
    
         
            +
                        torch.npu.empty_cache()
         
     | 
| 
       456 
512 
     | 
    
         
             
                    free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
         
     | 
| 
       457 
513 
     | 
    
         | 
| 
       458 
514 
     | 
    
         
             
                if distributed:
         
     | 
| 
         @@ -481,13 +537,13 @@ def make_layers( 
     | 
|
| 
       481 
537 
     | 
    
         
             
                pp_size: Optional[int] = None,
         
     | 
| 
       482 
538 
     | 
    
         
             
                prefix: str = "",
         
     | 
| 
       483 
539 
     | 
    
         
             
                return_tuple: bool = False,
         
     | 
| 
       484 
     | 
    
         
            -
                offloader_kwargs: Dict[str, Any] =  
     | 
| 
      
 540 
     | 
    
         
            +
                offloader_kwargs: Optional[Dict[str, Any]] = None,
         
     | 
| 
       485 
541 
     | 
    
         
             
            ) -> Tuple[torch.nn.Module, int, int]:
         
     | 
| 
       486 
542 
     | 
    
         
             
                """Make a list of layers with the given layer function"""
         
     | 
| 
       487 
543 
     | 
    
         
             
                # circular imports
         
     | 
| 
       488 
544 
     | 
    
         
             
                from sglang.srt.distributed import get_pp_indices
         
     | 
| 
       489 
545 
     | 
    
         
             
                from sglang.srt.layers.utils import PPMissingLayer
         
     | 
| 
       490 
     | 
    
         
            -
                from sglang.srt.offloader import get_offloader
         
     | 
| 
      
 546 
     | 
    
         
            +
                from sglang.srt.utils.offloader import get_offloader
         
     | 
| 
       491 
547 
     | 
    
         | 
| 
       492 
548 
     | 
    
         
             
                assert not pp_size or num_hidden_layers >= pp_size
         
     | 
| 
       493 
549 
     | 
    
         
             
                start_layer, end_layer = (
         
     | 
| 
         @@ -506,7 +562,7 @@ def make_layers( 
     | 
|
| 
       506 
562 
     | 
    
         
             
                            layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
         
     | 
| 
       507 
563 
     | 
    
         
             
                            for idx in range(start_layer, end_layer)
         
     | 
| 
       508 
564 
     | 
    
         
             
                        ),
         
     | 
| 
       509 
     | 
    
         
            -
                        **offloader_kwargs,
         
     | 
| 
      
 565 
     | 
    
         
            +
                        **(offloader_kwargs or {}),
         
     | 
| 
       510 
566 
     | 
    
         
             
                    )
         
     | 
| 
       511 
567 
     | 
    
         
             
                    + [
         
     | 
| 
       512 
568 
     | 
    
         
             
                        PPMissingLayer(return_tuple=return_tuple)
         
     | 
| 
         @@ -518,6 +574,24 @@ def make_layers( 
     | 
|
| 
       518 
574 
     | 
    
         
             
                return modules, start_layer, end_layer
         
     | 
| 
       519 
575 
     | 
    
         | 
| 
       520 
576 
     | 
    
         | 
| 
      
 577 
     | 
    
         
            +
            def make_layers_non_pp(
         
     | 
| 
      
 578 
     | 
    
         
            +
                num_hidden_layers: int,
         
     | 
| 
      
 579 
     | 
    
         
            +
                layer_fn: LayerFn,
         
     | 
| 
      
 580 
     | 
    
         
            +
                prefix: str = "",
         
     | 
| 
      
 581 
     | 
    
         
            +
            ) -> torch.nn.ModuleList:
         
     | 
| 
      
 582 
     | 
    
         
            +
                from sglang.srt.utils.offloader import get_offloader
         
     | 
| 
      
 583 
     | 
    
         
            +
             
     | 
| 
      
 584 
     | 
    
         
            +
                layers = torch.nn.ModuleList(
         
     | 
| 
      
 585 
     | 
    
         
            +
                    get_offloader().wrap_modules(
         
     | 
| 
      
 586 
     | 
    
         
            +
                        (
         
     | 
| 
      
 587 
     | 
    
         
            +
                            layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
         
     | 
| 
      
 588 
     | 
    
         
            +
                            for idx in range(num_hidden_layers)
         
     | 
| 
      
 589 
     | 
    
         
            +
                        )
         
     | 
| 
      
 590 
     | 
    
         
            +
                    )
         
     | 
| 
      
 591 
     | 
    
         
            +
                )
         
     | 
| 
      
 592 
     | 
    
         
            +
                return layers
         
     | 
| 
      
 593 
     | 
    
         
            +
             
     | 
| 
      
 594 
     | 
    
         
            +
             
     | 
| 
       521 
595 
     | 
    
         
             
            cmo_stream = None
         
     | 
| 
       522 
596 
     | 
    
         | 
| 
       523 
597 
     | 
    
         | 
| 
         @@ -811,9 +885,9 @@ def get_image_bytes(image_file: Union[str, bytes]): 
     | 
|
| 
       811 
885 
     | 
    
         
             
                        return f.read()
         
     | 
| 
       812 
886 
     | 
    
         
             
                elif image_file.startswith("data:"):
         
     | 
| 
       813 
887 
     | 
    
         
             
                    image_file = image_file.split(",")[1]
         
     | 
| 
       814 
     | 
    
         
            -
                    return pybase64.b64decode(image_file)
         
     | 
| 
      
 888 
     | 
    
         
            +
                    return pybase64.b64decode(image_file, validate=True)
         
     | 
| 
       815 
889 
     | 
    
         
             
                elif isinstance(image_file, str):
         
     | 
| 
       816 
     | 
    
         
            -
                    return pybase64.b64decode(image_file)
         
     | 
| 
      
 890 
     | 
    
         
            +
                    return pybase64.b64decode(image_file, validate=True)
         
     | 
| 
       817 
891 
     | 
    
         
             
                else:
         
     | 
| 
       818 
892 
     | 
    
         
             
                    raise NotImplementedError(f"Invalid image: {image_file}")
         
     | 
| 
       819 
893 
     | 
    
         | 
| 
         @@ -850,7 +924,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True): 
     | 
|
| 
       850 
924 
     | 
    
         
             
                            vr = VideoReader(tmp_file.name, ctx=ctx)
         
     | 
| 
       851 
925 
     | 
    
         
             
                        elif video_file.startswith("data:"):
         
     | 
| 
       852 
926 
     | 
    
         
             
                            _, encoded = video_file.split(",", 1)
         
     | 
| 
       853 
     | 
    
         
            -
                            video_bytes = pybase64.b64decode(encoded)
         
     | 
| 
      
 927 
     | 
    
         
            +
                            video_bytes = pybase64.b64decode(encoded, validate=True)
         
     | 
| 
       854 
928 
     | 
    
         
             
                            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         
     | 
| 
       855 
929 
     | 
    
         
             
                            tmp_file.write(video_bytes)
         
     | 
| 
       856 
930 
     | 
    
         
             
                            tmp_file.close()
         
     | 
| 
         @@ -858,7 +932,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True): 
     | 
|
| 
       858 
932 
     | 
    
         
             
                        elif os.path.isfile(video_file):
         
     | 
| 
       859 
933 
     | 
    
         
             
                            vr = VideoReader(video_file, ctx=ctx)
         
     | 
| 
       860 
934 
     | 
    
         
             
                        else:
         
     | 
| 
       861 
     | 
    
         
            -
                            video_bytes = pybase64.b64decode(video_file)
         
     | 
| 
      
 935 
     | 
    
         
            +
                            video_bytes = pybase64.b64decode(video_file, validate=True)
         
     | 
| 
       862 
936 
     | 
    
         
             
                            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
         
     | 
| 
       863 
937 
     | 
    
         
             
                            tmp_file.write(video_bytes)
         
     | 
| 
       864 
938 
     | 
    
         
             
                            tmp_file.close()
         
     | 
| 
         @@ -992,32 +1066,6 @@ def monkey_patch_p2p_access_check(): 
     | 
|
| 
       992 
1066 
     | 
    
         
             
                setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
         
     | 
| 
       993 
1067 
     | 
    
         | 
| 
       994 
1068 
     | 
    
         | 
| 
       995 
     | 
    
         
            -
            def monkey_patch_vllm_gguf_config():
         
     | 
| 
       996 
     | 
    
         
            -
                try:
         
     | 
| 
       997 
     | 
    
         
            -
                    from vllm.model_executor.layers.quantization.gguf import (
         
     | 
| 
       998 
     | 
    
         
            -
                        GGUFConfig,
         
     | 
| 
       999 
     | 
    
         
            -
                        GGUFEmbeddingMethod,
         
     | 
| 
       1000 
     | 
    
         
            -
                        GGUFLinearMethod,
         
     | 
| 
       1001 
     | 
    
         
            -
                    )
         
     | 
| 
       1002 
     | 
    
         
            -
                except ImportError:
         
     | 
| 
       1003 
     | 
    
         
            -
                    return
         
     | 
| 
       1004 
     | 
    
         
            -
             
     | 
| 
       1005 
     | 
    
         
            -
                from sglang.srt.layers.linear import LinearBase
         
     | 
| 
       1006 
     | 
    
         
            -
                from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
         
     | 
| 
       1007 
     | 
    
         
            -
             
     | 
| 
       1008 
     | 
    
         
            -
                def get_quant_method_with_embedding_replaced(
         
     | 
| 
       1009 
     | 
    
         
            -
                    self, layer: torch.nn.Module, prefix: str
         
     | 
| 
       1010 
     | 
    
         
            -
                ) -> Optional["QuantizeMethodBase"]:
         
     | 
| 
       1011 
     | 
    
         
            -
                    if isinstance(layer, LinearBase):
         
     | 
| 
       1012 
     | 
    
         
            -
                        return GGUFLinearMethod(self)
         
     | 
| 
       1013 
     | 
    
         
            -
                    elif isinstance(layer, VocabParallelEmbedding):
         
     | 
| 
       1014 
     | 
    
         
            -
                        # patch to own VocabParallelEmbedding
         
     | 
| 
       1015 
     | 
    
         
            -
                        return GGUFEmbeddingMethod(self)
         
     | 
| 
       1016 
     | 
    
         
            -
                    return None
         
     | 
| 
       1017 
     | 
    
         
            -
             
     | 
| 
       1018 
     | 
    
         
            -
                setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
         
     | 
| 
       1019 
     | 
    
         
            -
             
     | 
| 
       1020 
     | 
    
         
            -
             
     | 
| 
       1021 
1069 
     | 
    
         
             
            def set_ulimit(target_soft_limit=65535):
         
     | 
| 
       1022 
1070 
     | 
    
         
             
                # number of open files
         
     | 
| 
       1023 
1071 
     | 
    
         
             
                resource_type = resource.RLIMIT_NOFILE
         
     | 
| 
         @@ -1054,9 +1102,9 @@ def add_api_key_middleware(app, api_key: str): 
     | 
|
| 
       1054 
1102 
     | 
    
         
             
                async def authentication(request, call_next):
         
     | 
| 
       1055 
1103 
     | 
    
         
             
                    if request.method == "OPTIONS":
         
     | 
| 
       1056 
1104 
     | 
    
         
             
                        return await call_next(request)
         
     | 
| 
       1057 
     | 
    
         
            -
                    if request.url.path.startswith("/health") 
     | 
| 
       1058 
     | 
    
         
            -
                         
     | 
| 
       1059 
     | 
    
         
            -
                     
     | 
| 
      
 1105 
     | 
    
         
            +
                    if request.url.path.startswith("/health") or request.url.path.startswith(
         
     | 
| 
      
 1106 
     | 
    
         
            +
                        "/metrics"
         
     | 
| 
      
 1107 
     | 
    
         
            +
                    ):
         
     | 
| 
       1060 
1108 
     | 
    
         
             
                        return await call_next(request)
         
     | 
| 
       1061 
1109 
     | 
    
         
             
                    if request.headers.get("Authorization") != "Bearer " + api_key:
         
     | 
| 
       1062 
1110 
     | 
    
         
             
                        return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         
     | 
| 
         @@ -1083,7 +1131,7 @@ def configure_logger(server_args, prefix: str = ""): 
     | 
|
| 
       1083 
1131 
     | 
    
         
             
                            f"{SGLANG_LOGGING_CONFIG_PATH} but it does not exist!"
         
     | 
| 
       1084 
1132 
     | 
    
         
             
                        )
         
     | 
| 
       1085 
1133 
     | 
    
         
             
                    with open(SGLANG_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
         
     | 
| 
       1086 
     | 
    
         
            -
                        custom_config =  
     | 
| 
      
 1134 
     | 
    
         
            +
                        custom_config = orjson.loads(file.read())
         
     | 
| 
       1087 
1135 
     | 
    
         
             
                    logging.config.dictConfig(custom_config)
         
     | 
| 
       1088 
1136 
     | 
    
         
             
                    return
         
     | 
| 
       1089 
1137 
     | 
    
         
             
                format = f"[%(asctime)s{prefix}] %(message)s"
         
     | 
| 
         @@ -1262,8 +1310,46 @@ def pytorch_profile(name, func, *args, data_size=-1): 
     | 
|
| 
       1262 
1310 
     | 
    
         | 
| 
       1263 
1311 
     | 
    
         | 
| 
       1264 
1312 
     | 
    
         
             
            def get_zmq_socket(
         
     | 
| 
       1265 
     | 
    
         
            -
                context: zmq.Context, 
     | 
| 
       1266 
     | 
    
         
            -
             
     | 
| 
      
 1313 
     | 
    
         
            +
                context: zmq.Context,
         
     | 
| 
      
 1314 
     | 
    
         
            +
                socket_type: zmq.SocketType,
         
     | 
| 
      
 1315 
     | 
    
         
            +
                endpoint: Optional[str] = None,
         
     | 
| 
      
 1316 
     | 
    
         
            +
                bind: bool = True,
         
     | 
| 
      
 1317 
     | 
    
         
            +
            ) -> Union[zmq.Socket, Tuple[int, zmq.Socket]]:
         
     | 
| 
      
 1318 
     | 
    
         
            +
                """Create and configure a ZeroMQ socket.
         
     | 
| 
      
 1319 
     | 
    
         
            +
             
     | 
| 
      
 1320 
     | 
    
         
            +
                Args:
         
     | 
| 
      
 1321 
     | 
    
         
            +
                    context: ZeroMQ context to create the socket from.
         
     | 
| 
      
 1322 
     | 
    
         
            +
                    socket_type: Type of ZeroMQ socket to create.
         
     | 
| 
      
 1323 
     | 
    
         
            +
                    endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
         
     | 
| 
      
 1324 
     | 
    
         
            +
                    bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.
         
     | 
| 
      
 1325 
     | 
    
         
            +
             
     | 
| 
      
 1326 
     | 
    
         
            +
                Returns:
         
     | 
| 
      
 1327 
     | 
    
         
            +
                    If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
         
     | 
| 
      
 1328 
     | 
    
         
            +
                    If endpoint is provided: The configured ZeroMQ socket.
         
     | 
| 
      
 1329 
     | 
    
         
            +
                """
         
     | 
| 
      
 1330 
     | 
    
         
            +
                socket = context.socket(socket_type)
         
     | 
| 
      
 1331 
     | 
    
         
            +
             
     | 
| 
      
 1332 
     | 
    
         
            +
                if endpoint is None:
         
     | 
| 
      
 1333 
     | 
    
         
            +
                    # Bind to random TCP port
         
     | 
| 
      
 1334 
     | 
    
         
            +
                    config_socket(socket, socket_type)
         
     | 
| 
      
 1335 
     | 
    
         
            +
                    port = socket.bind_to_random_port("tcp://*")
         
     | 
| 
      
 1336 
     | 
    
         
            +
                    return port, socket
         
     | 
| 
      
 1337 
     | 
    
         
            +
                else:
         
     | 
| 
      
 1338 
     | 
    
         
            +
                    # Handle IPv6 if endpoint contains brackets
         
     | 
| 
      
 1339 
     | 
    
         
            +
                    if endpoint.find("[") != -1:
         
     | 
| 
      
 1340 
     | 
    
         
            +
                        socket.setsockopt(zmq.IPV6, 1)
         
     | 
| 
      
 1341 
     | 
    
         
            +
             
     | 
| 
      
 1342 
     | 
    
         
            +
                    config_socket(socket, socket_type)
         
     | 
| 
      
 1343 
     | 
    
         
            +
             
     | 
| 
      
 1344 
     | 
    
         
            +
                    if bind:
         
     | 
| 
      
 1345 
     | 
    
         
            +
                        socket.bind(endpoint)
         
     | 
| 
      
 1346 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 1347 
     | 
    
         
            +
                        socket.connect(endpoint)
         
     | 
| 
      
 1348 
     | 
    
         
            +
             
     | 
| 
      
 1349 
     | 
    
         
            +
                    return socket
         
     | 
| 
      
 1350 
     | 
    
         
            +
             
     | 
| 
      
 1351 
     | 
    
         
            +
             
     | 
| 
      
 1352 
     | 
    
         
            +
            def config_socket(socket, socket_type: zmq.SocketType):
         
     | 
| 
       1267 
1353 
     | 
    
         
             
                mem = psutil.virtual_memory()
         
     | 
| 
       1268 
1354 
     | 
    
         
             
                total_mem = mem.total / 1024**3
         
     | 
| 
       1269 
1355 
     | 
    
         
             
                available_mem = mem.available / 1024**3
         
     | 
| 
         @@ -1272,10 +1358,6 @@ def get_zmq_socket( 
     | 
|
| 
       1272 
1358 
     | 
    
         
             
                else:
         
     | 
| 
       1273 
1359 
     | 
    
         
             
                    buf_size = -1
         
     | 
| 
       1274 
1360 
     | 
    
         | 
| 
       1275 
     | 
    
         
            -
                socket = context.socket(socket_type)
         
     | 
| 
       1276 
     | 
    
         
            -
                if endpoint.find("[") != -1:
         
     | 
| 
       1277 
     | 
    
         
            -
                    socket.setsockopt(zmq.IPV6, 1)
         
     | 
| 
       1278 
     | 
    
         
            -
             
     | 
| 
       1279 
1361 
     | 
    
         
             
                def set_send_opt():
         
     | 
| 
       1280 
1362 
     | 
    
         
             
                    socket.setsockopt(zmq.SNDHWM, 0)
         
     | 
| 
       1281 
1363 
     | 
    
         
             
                    socket.setsockopt(zmq.SNDBUF, buf_size)
         
     | 
| 
         @@ -1288,19 +1370,12 @@ def get_zmq_socket( 
     | 
|
| 
       1288 
1370 
     | 
    
         
             
                    set_send_opt()
         
     | 
| 
       1289 
1371 
     | 
    
         
             
                elif socket_type == zmq.PULL:
         
     | 
| 
       1290 
1372 
     | 
    
         
             
                    set_recv_opt()
         
     | 
| 
       1291 
     | 
    
         
            -
                elif socket_type  
     | 
| 
      
 1373 
     | 
    
         
            +
                elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
         
     | 
| 
       1292 
1374 
     | 
    
         
             
                    set_send_opt()
         
     | 
| 
       1293 
1375 
     | 
    
         
             
                    set_recv_opt()
         
     | 
| 
       1294 
1376 
     | 
    
         
             
                else:
         
     | 
| 
       1295 
1377 
     | 
    
         
             
                    raise ValueError(f"Unsupported socket type: {socket_type}")
         
     | 
| 
       1296 
1378 
     | 
    
         | 
| 
       1297 
     | 
    
         
            -
                if bind:
         
     | 
| 
       1298 
     | 
    
         
            -
                    socket.bind(endpoint)
         
     | 
| 
       1299 
     | 
    
         
            -
                else:
         
     | 
| 
       1300 
     | 
    
         
            -
                    socket.connect(endpoint)
         
     | 
| 
       1301 
     | 
    
         
            -
             
     | 
| 
       1302 
     | 
    
         
            -
                return socket
         
     | 
| 
       1303 
     | 
    
         
            -
             
     | 
| 
       1304 
1379 
     | 
    
         | 
| 
       1305 
1380 
     | 
    
         
             
            def dump_to_file(dirpath, name, value):
         
     | 
| 
       1306 
1381 
     | 
    
         
             
                from sglang.srt.distributed import get_tensor_model_parallel_rank
         
     | 
| 
         @@ -1500,7 +1575,7 @@ def get_hpu_memory_capacity(): 
     | 
|
| 
       1500 
1575 
     | 
    
         | 
| 
       1501 
1576 
     | 
    
         
             
            def get_npu_memory_capacity():
         
     | 
| 
       1502 
1577 
     | 
    
         
             
                try:
         
     | 
| 
       1503 
     | 
    
         
            -
                    import torch_npu
         
     | 
| 
      
 1578 
     | 
    
         
            +
                    import torch_npu  # noqa: F401
         
     | 
| 
       1504 
1579 
     | 
    
         | 
| 
       1505 
1580 
     | 
    
         
             
                    return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
         
     | 
| 
       1506 
1581 
     | 
    
         
             
                except ImportError as e:
         
     | 
| 
         @@ -1521,13 +1596,18 @@ def get_cpu_memory_capacity(): 
     | 
|
| 
       1521 
1596 
     | 
    
         
             
                    for numa_id in range(n_numa_node):
         
     | 
| 
       1522 
1597 
     | 
    
         
             
                        file_meminfo = f"node{numa_id}/meminfo"
         
     | 
| 
       1523 
1598 
     | 
    
         
             
                        with open(os.path.join(file_prefix, file_meminfo), "r") as f:
         
     | 
| 
       1524 
     | 
    
         
            -
                            # 1st line 
     | 
| 
       1525 
     | 
    
         
            -
                            line = f. 
     | 
| 
       1526 
     | 
    
         
            -
                             
     | 
| 
      
 1599 
     | 
    
         
            +
                            # MemTotal info is at the 1st line
         
     | 
| 
      
 1600 
     | 
    
         
            +
                            line = f.readline()
         
     | 
| 
      
 1601 
     | 
    
         
            +
                            # Expected format: "Node 0 MemTotal:       100000000 kB"
         
     | 
| 
      
 1602 
     | 
    
         
            +
                            parts = line.split()
         
     | 
| 
      
 1603 
     | 
    
         
            +
                            if len(parts) >= 4 and parts[2] == "MemTotal:":
         
     | 
| 
      
 1604 
     | 
    
         
            +
                                numa_mem_list.append(int(parts[3]))
         
     | 
| 
      
 1605 
     | 
    
         
            +
                            else:
         
     | 
| 
      
 1606 
     | 
    
         
            +
                                raise ValueError(f"Unexpected format in {file_meminfo}: {line}")
         
     | 
| 
       1527 
1607 
     | 
    
         
             
                    # Retrieved value in KB, need MB
         
     | 
| 
       1528 
1608 
     | 
    
         
             
                    numa_mem = float(min(numa_mem_list) // 1024)
         
     | 
| 
       1529 
1609 
     | 
    
         
             
                    return numa_mem
         
     | 
| 
       1530 
     | 
    
         
            -
                except FileNotFoundError:
         
     | 
| 
      
 1610 
     | 
    
         
            +
                except (FileNotFoundError, ValueError, IndexError):
         
     | 
| 
       1531 
1611 
     | 
    
         
             
                    numa_mem = psutil.virtual_memory().total / n_numa_node
         
     | 
| 
       1532 
1612 
     | 
    
         
             
                    # Retrieved value in Byte, need MB
         
     | 
| 
       1533 
1613 
     | 
    
         
             
                    return float(numa_mem // (1 << 20))
         
     | 
| 
         @@ -1687,7 +1767,7 @@ def get_device(device_id: Optional[int] = None) -> str: 
     | 
|
| 
       1687 
1767 
     | 
    
         | 
| 
       1688 
1768 
     | 
    
         
             
                if is_habana_available():
         
     | 
| 
       1689 
1769 
     | 
    
         
             
                    try:
         
     | 
| 
       1690 
     | 
    
         
            -
                        import habana_frameworks.torch.hpu
         
     | 
| 
      
 1770 
     | 
    
         
            +
                        import habana_frameworks.torch.hpu  # noqa: F401
         
     | 
| 
       1691 
1771 
     | 
    
         | 
| 
       1692 
1772 
     | 
    
         
             
                        if torch.hpu.is_available():
         
     | 
| 
       1693 
1773 
     | 
    
         
             
                            if device_id == None:
         
     | 
| 
         @@ -1717,7 +1797,7 @@ def get_device_count() -> int: 
     | 
|
| 
       1717 
1797 
     | 
    
         | 
| 
       1718 
1798 
     | 
    
         
             
                if is_habana_available():
         
     | 
| 
       1719 
1799 
     | 
    
         
             
                    try:
         
     | 
| 
       1720 
     | 
    
         
            -
                        import habana_frameworks.torch.hpu
         
     | 
| 
      
 1800 
     | 
    
         
            +
                        import habana_frameworks.torch.hpu  # noqa: F401
         
     | 
| 
       1721 
1801 
     | 
    
         | 
| 
       1722 
1802 
     | 
    
         
             
                        if torch.hpu.is_available():
         
     | 
| 
       1723 
1803 
     | 
    
         
             
                            return torch.hpu.device_count()
         
     | 
| 
         @@ -1860,7 +1940,9 @@ def direct_register_custom_op( 
     | 
|
| 
       1860 
1940 
     | 
    
         
             
                    if fake_impl is not None:
         
     | 
| 
       1861 
1941 
     | 
    
         
             
                        my_lib._register_fake(op_name, fake_impl)
         
     | 
| 
       1862 
1942 
     | 
    
         
             
                except RuntimeError as error:
         
     | 
| 
       1863 
     | 
    
         
            -
                    if "Tried to register an operator" in str( 
     | 
| 
      
 1943 
     | 
    
         
            +
                    if "Tried to register an operator" in str(error) and "multiple times" in str(
         
     | 
| 
      
 1944 
     | 
    
         
            +
                        error
         
     | 
| 
      
 1945 
     | 
    
         
            +
                    ):
         
     | 
| 
       1864 
1946 
     | 
    
         
             
                        # Silently ignore duplicate registration errors
         
     | 
| 
       1865 
1947 
     | 
    
         
             
                        # This can happen in multi-engine scenarios
         
     | 
| 
       1866 
1948 
     | 
    
         
             
                        pass
         
     | 
| 
         @@ -1873,6 +1955,7 @@ def direct_register_custom_op( 
     | 
|
| 
       1873 
1955 
     | 
    
         | 
| 
       1874 
1956 
     | 
    
         | 
| 
       1875 
1957 
     | 
    
         
             
            def set_gpu_proc_affinity(
         
     | 
| 
      
 1958 
     | 
    
         
            +
                pp_size: int,
         
     | 
| 
       1876 
1959 
     | 
    
         
             
                tp_size: int,
         
     | 
| 
       1877 
1960 
     | 
    
         
             
                nnodes: int,
         
     | 
| 
       1878 
1961 
     | 
    
         
             
                gpu_id: int,
         
     | 
| 
         @@ -1881,7 +1964,8 @@ def set_gpu_proc_affinity( 
     | 
|
| 
       1881 
1964 
     | 
    
         
             
                pid = os.getpid()
         
     | 
| 
       1882 
1965 
     | 
    
         
             
                p = psutil.Process(pid)
         
     | 
| 
       1883 
1966 
     | 
    
         | 
| 
       1884 
     | 
    
         
            -
                 
     | 
| 
      
 1967 
     | 
    
         
            +
                nnodes_per_tp_group = max(nnodes // pp_size, 1)
         
     | 
| 
      
 1968 
     | 
    
         
            +
                tp_size_per_node = tp_size // nnodes_per_tp_group
         
     | 
| 
       1885 
1969 
     | 
    
         | 
| 
       1886 
1970 
     | 
    
         
             
                # total physical cores
         
     | 
| 
       1887 
1971 
     | 
    
         
             
                total_pcores = psutil.cpu_count(logical=False)
         
     | 
| 
         @@ -2012,7 +2096,78 @@ class MultiprocessingSerializer: 
     | 
|
| 
       2012 
2096 
     | 
    
         
             
                        # Decode base64 string to bytes
         
     | 
| 
       2013 
2097 
     | 
    
         
             
                        data = pybase64.b64decode(data, validate=True)
         
     | 
| 
       2014 
2098 
     | 
    
         | 
| 
       2015 
     | 
    
         
            -
                    return  
     | 
| 
      
 2099 
     | 
    
         
            +
                    return SafeUnpickler(io.BytesIO(data)).load()
         
     | 
| 
      
 2100 
     | 
    
         
            +
             
     | 
| 
      
 2101 
     | 
    
         
            +
             
     | 
| 
      
 2102 
     | 
    
         
            +
            class SafeUnpickler(pickle.Unpickler):
         
     | 
| 
      
 2103 
     | 
    
         
            +
                ALLOWED_MODULE_PREFIXES = {
         
     | 
| 
      
 2104 
     | 
    
         
            +
                    # --- Python types ---
         
     | 
| 
      
 2105 
     | 
    
         
            +
                    "builtins.",
         
     | 
| 
      
 2106 
     | 
    
         
            +
                    "collections.",
         
     | 
| 
      
 2107 
     | 
    
         
            +
                    "copyreg.",
         
     | 
| 
      
 2108 
     | 
    
         
            +
                    "functools.",
         
     | 
| 
      
 2109 
     | 
    
         
            +
                    "itertools.",
         
     | 
| 
      
 2110 
     | 
    
         
            +
                    "operator.",
         
     | 
| 
      
 2111 
     | 
    
         
            +
                    "types.",
         
     | 
| 
      
 2112 
     | 
    
         
            +
                    "weakref.",
         
     | 
| 
      
 2113 
     | 
    
         
            +
                    # --- PyTorch types ---
         
     | 
| 
      
 2114 
     | 
    
         
            +
                    "torch.",
         
     | 
| 
      
 2115 
     | 
    
         
            +
                    "torch._tensor.",
         
     | 
| 
      
 2116 
     | 
    
         
            +
                    "torch.storage.",
         
     | 
| 
      
 2117 
     | 
    
         
            +
                    "torch.nn.parameter.",
         
     | 
| 
      
 2118 
     | 
    
         
            +
                    "torch.autograd.function.",
         
     | 
| 
      
 2119 
     | 
    
         
            +
                    # --- torch distributed ---
         
     | 
| 
      
 2120 
     | 
    
         
            +
                    "torch.distributed.",
         
     | 
| 
      
 2121 
     | 
    
         
            +
                    "torch.distributed._shard.",
         
     | 
| 
      
 2122 
     | 
    
         
            +
                    "torch.distributed._composable.",
         
     | 
| 
      
 2123 
     | 
    
         
            +
                    "torch._C._distributed_c10d.",
         
     | 
| 
      
 2124 
     | 
    
         
            +
                    "torch._C._distributed_fsdp.",
         
     | 
| 
      
 2125 
     | 
    
         
            +
                    "torch.distributed.optim.",
         
     | 
| 
      
 2126 
     | 
    
         
            +
                    # --- multiprocessing ---
         
     | 
| 
      
 2127 
     | 
    
         
            +
                    "multiprocessing.resource_sharer.",
         
     | 
| 
      
 2128 
     | 
    
         
            +
                    "multiprocessing.reduction.",
         
     | 
| 
      
 2129 
     | 
    
         
            +
                    "pickletools.",
         
     | 
| 
      
 2130 
     | 
    
         
            +
                    # --- PEFT / LoRA ---
         
     | 
| 
      
 2131 
     | 
    
         
            +
                    "peft.",
         
     | 
| 
      
 2132 
     | 
    
         
            +
                    "transformers.",
         
     | 
| 
      
 2133 
     | 
    
         
            +
                    "huggingface_hub.",
         
     | 
| 
      
 2134 
     | 
    
         
            +
                    # --- SGLang & Unitest ---
         
     | 
| 
      
 2135 
     | 
    
         
            +
                    "sglang.srt.weight_sync.tensor_bucket.",
         
     | 
| 
      
 2136 
     | 
    
         
            +
                    "sglang.srt.model_executor.model_runner.",
         
     | 
| 
      
 2137 
     | 
    
         
            +
                    "sglang.srt.layers.",
         
     | 
| 
      
 2138 
     | 
    
         
            +
                    "sglang.srt.utils.",
         
     | 
| 
      
 2139 
     | 
    
         
            +
                }
         
     | 
| 
      
 2140 
     | 
    
         
            +
             
     | 
| 
      
 2141 
     | 
    
         
            +
                DENY_CLASSES = {
         
     | 
| 
      
 2142 
     | 
    
         
            +
                    ("builtins", "eval"),
         
     | 
| 
      
 2143 
     | 
    
         
            +
                    ("builtins", "exec"),
         
     | 
| 
      
 2144 
     | 
    
         
            +
                    ("builtins", "compile"),
         
     | 
| 
      
 2145 
     | 
    
         
            +
                    ("os", "system"),
         
     | 
| 
      
 2146 
     | 
    
         
            +
                    ("subprocess", "Popen"),
         
     | 
| 
      
 2147 
     | 
    
         
            +
                    ("subprocess", "run"),
         
     | 
| 
      
 2148 
     | 
    
         
            +
                    ("codecs", "decode"),
         
     | 
| 
      
 2149 
     | 
    
         
            +
                    ("types", "CodeType"),
         
     | 
| 
      
 2150 
     | 
    
         
            +
                    ("types", "FunctionType"),
         
     | 
| 
      
 2151 
     | 
    
         
            +
                }
         
     | 
| 
      
 2152 
     | 
    
         
            +
             
     | 
| 
      
 2153 
     | 
    
         
            +
                def find_class(self, module, name):
         
     | 
| 
      
 2154 
     | 
    
         
            +
                    # Block deterministic attacks
         
     | 
| 
      
 2155 
     | 
    
         
            +
                    if (module, name) in self.DENY_CLASSES:
         
     | 
| 
      
 2156 
     | 
    
         
            +
                        raise RuntimeError(
         
     | 
| 
      
 2157 
     | 
    
         
            +
                            f"Blocked unsafe class loading ({module}.{name}), "
         
     | 
| 
      
 2158 
     | 
    
         
            +
                            f"to prevent exploitation of CVE-2025-10164"
         
     | 
| 
      
 2159 
     | 
    
         
            +
                        )
         
     | 
| 
      
 2160 
     | 
    
         
            +
                    # Allowlist of safe-to-load modules.
         
     | 
| 
      
 2161 
     | 
    
         
            +
                    if any(
         
     | 
| 
      
 2162 
     | 
    
         
            +
                        (module + ".").startswith(prefix) for prefix in self.ALLOWED_MODULE_PREFIXES
         
     | 
| 
      
 2163 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 2164 
     | 
    
         
            +
                        return super().find_class(module, name)
         
     | 
| 
      
 2165 
     | 
    
         
            +
             
     | 
| 
      
 2166 
     | 
    
         
            +
                    # Block everything else. (Potential attack surface)
         
     | 
| 
      
 2167 
     | 
    
         
            +
                    raise RuntimeError(
         
     | 
| 
      
 2168 
     | 
    
         
            +
                        f"Blocked unsafe class loading ({module}.{name}), "
         
     | 
| 
      
 2169 
     | 
    
         
            +
                        f"to prevent exploitation of CVE-2025-10164"
         
     | 
| 
      
 2170 
     | 
    
         
            +
                    )
         
     | 
| 
       2016 
2171 
     | 
    
         | 
| 
       2017 
2172 
     | 
    
         | 
| 
       2018 
2173 
     | 
    
         
             
            def debug_timing(func):
         
     | 
| 
         @@ -2164,6 +2319,11 @@ def launch_dummy_health_check_server(host, port, enable_metrics): 
     | 
|
| 
       2164 
2319 
     | 
    
         | 
| 
       2165 
2320 
     | 
    
         
             
                app = FastAPI()
         
     | 
| 
       2166 
2321 
     | 
    
         | 
| 
      
 2322 
     | 
    
         
            +
                @app.get("/ping")
         
     | 
| 
      
 2323 
     | 
    
         
            +
                async def ping():
         
     | 
| 
      
 2324 
     | 
    
         
            +
                    """Could be used by the checkpoint-engine update script to confirm the server is up."""
         
     | 
| 
      
 2325 
     | 
    
         
            +
                    return Response(status_code=200)
         
     | 
| 
      
 2326 
     | 
    
         
            +
             
     | 
| 
       2167 
2327 
     | 
    
         
             
                @app.get("/health")
         
     | 
| 
       2168 
2328 
     | 
    
         
             
                async def health():
         
     | 
| 
       2169 
2329 
     | 
    
         
             
                    """Check the health of the http server."""
         
     | 
| 
         @@ -2286,6 +2446,8 @@ def retry( 
     | 
|
| 
       2286 
2446 
     | 
    
         
             
                    try:
         
     | 
| 
       2287 
2447 
     | 
    
         
             
                        return fn()
         
     | 
| 
       2288 
2448 
     | 
    
         
             
                    except Exception as e:
         
     | 
| 
      
 2449 
     | 
    
         
            +
                        traceback.print_exc()
         
     | 
| 
      
 2450 
     | 
    
         
            +
             
     | 
| 
       2289 
2451 
     | 
    
         
             
                        if try_index >= max_retry:
         
     | 
| 
       2290 
2452 
     | 
    
         
             
                            raise Exception(f"retry() exceed maximum number of retries.")
         
     | 
| 
       2291 
2453 
     | 
    
         | 
| 
         @@ -2299,11 +2461,30 @@ def retry( 
     | 
|
| 
       2299 
2461 
     | 
    
         
             
                        logger.warning(
         
     | 
| 
       2300 
2462 
     | 
    
         
             
                            f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
         
     | 
| 
       2301 
2463 
     | 
    
         
             
                        )
         
     | 
| 
       2302 
     | 
    
         
            -
                        traceback.print_exc()
         
     | 
| 
       2303 
2464 
     | 
    
         | 
| 
       2304 
2465 
     | 
    
         
             
                        time.sleep(delay)
         
     | 
| 
       2305 
2466 
     | 
    
         | 
| 
       2306 
2467 
     | 
    
         | 
| 
      
 2468 
     | 
    
         
            +
            def has_hf_quant_config(model_path: str) -> bool:
         
     | 
| 
      
 2469 
     | 
    
         
            +
                """Check if the model path contains hf_quant_config.json file.
         
     | 
| 
      
 2470 
     | 
    
         
            +
             
     | 
| 
      
 2471 
     | 
    
         
            +
                Args:
         
     | 
| 
      
 2472 
     | 
    
         
            +
                    model_path: Path to the model, can be local path or remote URL.
         
     | 
| 
      
 2473 
     | 
    
         
            +
             
     | 
| 
      
 2474 
     | 
    
         
            +
                Returns:
         
     | 
| 
      
 2475 
     | 
    
         
            +
                    True if hf_quant_config.json exists, False otherwise.
         
     | 
| 
      
 2476 
     | 
    
         
            +
                """
         
     | 
| 
      
 2477 
     | 
    
         
            +
                if os.path.exists(os.path.join(model_path, "hf_quant_config.json")):
         
     | 
| 
      
 2478 
     | 
    
         
            +
                    return True
         
     | 
| 
      
 2479 
     | 
    
         
            +
                try:
         
     | 
| 
      
 2480 
     | 
    
         
            +
                    from huggingface_hub import HfApi
         
     | 
| 
      
 2481 
     | 
    
         
            +
             
     | 
| 
      
 2482 
     | 
    
         
            +
                    hf_api = HfApi()
         
     | 
| 
      
 2483 
     | 
    
         
            +
                    return hf_api.file_exists(model_path, "hf_quant_config.json")
         
     | 
| 
      
 2484 
     | 
    
         
            +
                except Exception:
         
     | 
| 
      
 2485 
     | 
    
         
            +
                    return False
         
     | 
| 
      
 2486 
     | 
    
         
            +
             
     | 
| 
      
 2487 
     | 
    
         
            +
             
     | 
| 
       2307 
2488 
     | 
    
         
             
            def flatten_nested_list(nested_list):
         
     | 
| 
       2308 
2489 
     | 
    
         
             
                if isinstance(nested_list, list):
         
     | 
| 
       2309 
2490 
     | 
    
         
             
                    return [
         
     | 
| 
         @@ -2439,17 +2620,12 @@ def get_local_ip_auto(fallback: str = None) -> str: 
     | 
|
| 
       2439 
2620 
     | 
    
         
             
                raise ValueError("Can not get local ip")
         
     | 
| 
       2440 
2621 
     | 
    
         | 
| 
       2441 
2622 
     | 
    
         | 
| 
       2442 
     | 
    
         
            -
            def is_page_size_one(server_args):
         
     | 
| 
       2443 
     | 
    
         
            -
                return server_args.page_size == 1
         
     | 
| 
       2444 
     | 
    
         
            -
             
     | 
| 
       2445 
     | 
    
         
            -
             
     | 
| 
       2446 
2623 
     | 
    
         
             
            # TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
         
     | 
| 
       2447 
2624 
     | 
    
         
             
            # TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
         
     | 
| 
       2448 
2625 
     | 
    
         
             
            def is_no_spec_infer_or_topk_one(server_args):
         
     | 
| 
       2449 
2626 
     | 
    
         
             
                return server_args.speculative_eagle_topk is None or (
         
     | 
| 
       2450 
     | 
    
         
            -
                    server_args.speculative_eagle_topk  
     | 
| 
       2451 
     | 
    
         
            -
                    and server_args. 
     | 
| 
       2452 
     | 
    
         
            -
                    and is_page_size_one(server_args)
         
     | 
| 
      
 2627 
     | 
    
         
            +
                    server_args.speculative_eagle_topk == 1
         
     | 
| 
      
 2628 
     | 
    
         
            +
                    and (server_args.page_size == 1 or server_args.page_size is None)
         
     | 
| 
       2453 
2629 
     | 
    
         
             
                )
         
     | 
| 
       2454 
2630 
     | 
    
         | 
| 
       2455 
2631 
     | 
    
         | 
| 
         @@ -2461,6 +2637,7 @@ def is_fa3_default_architecture(hf_config): 
     | 
|
| 
       2461 
2637 
     | 
    
         
             
                    "Qwen2ForCausalLM",
         
     | 
| 
       2462 
2638 
     | 
    
         
             
                    "Llama4ForConditionalGeneration",
         
     | 
| 
       2463 
2639 
     | 
    
         
             
                    "LlamaForCausalLM",
         
     | 
| 
      
 2640 
     | 
    
         
            +
                    "Olmo2ForCausalLM",
         
     | 
| 
       2464 
2641 
     | 
    
         
             
                    "Gemma2ForCausalLM",
         
     | 
| 
       2465 
2642 
     | 
    
         
             
                    "Gemma3ForConditionalGeneration",
         
     | 
| 
       2466 
2643 
     | 
    
         
             
                    "Qwen3ForCausalLM",
         
     | 
| 
         @@ -2494,9 +2671,9 @@ def log_info_on_rank0(logger, msg): 
     | 
|
| 
       2494 
2671 
     | 
    
         | 
| 
       2495 
2672 
     | 
    
         
             
            def load_json_config(data: str):
         
     | 
| 
       2496 
2673 
     | 
    
         
             
                try:
         
     | 
| 
       2497 
     | 
    
         
            -
                    return  
     | 
| 
      
 2674 
     | 
    
         
            +
                    return orjson.loads(data)
         
     | 
| 
       2498 
2675 
     | 
    
         
             
                except JSONDecodeError:
         
     | 
| 
       2499 
     | 
    
         
            -
                    return  
     | 
| 
      
 2676 
     | 
    
         
            +
                    return orjson.loads(Path(data).read_text())
         
     | 
| 
       2500 
2677 
     | 
    
         | 
| 
       2501 
2678 
     | 
    
         | 
| 
       2502 
2679 
     | 
    
         
             
            def dispose_tensor(x: torch.Tensor):
         
     | 
| 
         @@ -2863,7 +3040,7 @@ def get_cpu_ids_by_node(): 
     | 
|
| 
       2863 
3040 
     | 
    
         
             
            def is_shm_available(dtype, world_size, local_size):
         
     | 
| 
       2864 
3041 
     | 
    
         
             
                return (
         
     | 
| 
       2865 
3042 
     | 
    
         
             
                    cpu_has_amx_support()
         
     | 
| 
       2866 
     | 
    
         
            -
                    and dtype in [torch.bfloat16, torch.float]
         
     | 
| 
      
 3043 
     | 
    
         
            +
                    and dtype in [torch.bfloat16, torch.float16, torch.float]
         
     | 
| 
       2867 
3044 
     | 
    
         
             
                    and world_size >= 1
         
     | 
| 
       2868 
3045 
     | 
    
         
             
                    and world_size == local_size
         
     | 
| 
       2869 
3046 
     | 
    
         
             
                )
         
     | 
| 
         @@ -2914,10 +3091,6 @@ def lru_cache_frozenset(maxsize=128): 
     | 
|
| 
       2914 
3091 
     | 
    
         
             
                return decorator
         
     | 
| 
       2915 
3092 
     | 
    
         | 
| 
       2916 
3093 
     | 
    
         | 
| 
       2917 
     | 
    
         
            -
            def get_origin_rid(rid):
         
     | 
| 
       2918 
     | 
    
         
            -
                return rid.split("_", 1)[1] if "_" in rid else rid
         
     | 
| 
       2919 
     | 
    
         
            -
             
     | 
| 
       2920 
     | 
    
         
            -
             
     | 
| 
       2921 
3094 
     | 
    
         
             
            def apply_module_patch(target_module, target_function, wrappers):
         
     | 
| 
       2922 
3095 
     | 
    
         
             
                original_module, original_function = parse_module_path(
         
     | 
| 
       2923 
3096 
     | 
    
         
             
                    target_module, target_function, False
         
     | 
| 
         @@ -3205,7 +3378,7 @@ def numa_bind_to_node(node: int): 
     | 
|
| 
       3205 
3378 
     | 
    
         | 
| 
       3206 
3379 
     | 
    
         
             
            def json_list_type(value):
         
     | 
| 
       3207 
3380 
     | 
    
         
             
                try:
         
     | 
| 
       3208 
     | 
    
         
            -
                    return  
     | 
| 
      
 3381 
     | 
    
         
            +
                    return orjson.loads(value)
         
     | 
| 
       3209 
3382 
     | 
    
         
             
                except json.JSONDecodeError:
         
     | 
| 
       3210 
3383 
     | 
    
         
             
                    raise argparse.ArgumentTypeError(
         
     | 
| 
       3211 
3384 
     | 
    
         
             
                        f"Invalid JSON list: {value}. Please provide a valid JSON list."
         
     | 
| 
         @@ -3213,7 +3386,12 @@ def json_list_type(value): 
     | 
|
| 
       3213 
3386 
     | 
    
         | 
| 
       3214 
3387 
     | 
    
         | 
| 
       3215 
3388 
     | 
    
         
             
            @contextmanager
         
     | 
| 
       3216 
     | 
    
         
            -
            def  
     | 
| 
      
 3389 
     | 
    
         
            +
            def maybe_reindex_device_id(gpu_id: int):
         
     | 
| 
      
 3390 
     | 
    
         
            +
             
     | 
| 
      
 3391 
     | 
    
         
            +
                if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() is False or not is_cuda_alike():
         
     | 
| 
      
 3392 
     | 
    
         
            +
                    yield gpu_id
         
     | 
| 
      
 3393 
     | 
    
         
            +
                    return
         
     | 
| 
      
 3394 
     | 
    
         
            +
             
     | 
| 
       3217 
3395 
     | 
    
         
             
                original_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
         
     | 
| 
       3218 
3396 
     | 
    
         
             
                if original_cuda_visible_devices:
         
     | 
| 
       3219 
3397 
     | 
    
         
             
                    cuda_visible_devices = original_cuda_visible_devices.split(",")
         
     | 
| 
         @@ -3222,7 +3400,11 @@ def temp_set_cuda_visible_devices(gpu_id: int): 
     | 
|
| 
       3222 
3400 
     | 
    
         | 
| 
       3223 
3401 
     | 
    
         
             
                str_gpu_id = cuda_visible_devices[gpu_id] if cuda_visible_devices else str(gpu_id)
         
     | 
| 
       3224 
3402 
     | 
    
         
             
                os.environ["CUDA_VISIBLE_DEVICES"] = str_gpu_id
         
     | 
| 
       3225 
     | 
    
         
            -
             
     | 
| 
      
 3403 
     | 
    
         
            +
             
     | 
| 
      
 3404 
     | 
    
         
            +
                logger.debug(f"Set CUDA_VISIBLE_DEVICES to {str_gpu_id}")
         
     | 
| 
      
 3405 
     | 
    
         
            +
             
     | 
| 
      
 3406 
     | 
    
         
            +
                yield 0
         
     | 
| 
      
 3407 
     | 
    
         
            +
             
     | 
| 
       3226 
3408 
     | 
    
         
             
                if original_cuda_visible_devices:
         
     | 
| 
       3227 
3409 
     | 
    
         
             
                    os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible_devices
         
     | 
| 
       3228 
3410 
     | 
    
         
             
                else:
         
     | 
| 
         @@ -3383,3 +3565,11 @@ def cached_triton_kernel(key_fn=None): 
     | 
|
| 
       3383 
3565 
     | 
    
         
             
                    return CachedKernel(fn, key_fn)
         
     | 
| 
       3384 
3566 
     | 
    
         | 
| 
       3385 
3567 
     | 
    
         
             
                return decorator
         
     | 
| 
      
 3568 
     | 
    
         
            +
             
     | 
| 
      
 3569 
     | 
    
         
            +
             
     | 
| 
      
 3570 
     | 
    
         
            +
            # Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
         
     | 
| 
      
 3571 
     | 
    
         
            +
            def calc_diff(x, y):
         
     | 
| 
      
 3572 
     | 
    
         
            +
                x, y = x.double(), y.double()
         
     | 
| 
      
 3573 
     | 
    
         
            +
                denominator = (x * x + y * y).sum()
         
     | 
| 
      
 3574 
     | 
    
         
            +
                sim = 2 * (x * y).sum() / denominator
         
     | 
| 
      
 3575 
     | 
    
         
            +
                return 1 - sim
         
     |