sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/utils/common.py
CHANGED
```diff
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
-
 from __future__ import annotations
 
 import argparse
@@ -43,6 +42,7 @@ import tempfile
 import threading
 import time
 import traceback
+import types
 import uuid
 import warnings
 from collections import OrderedDict, defaultdict
@@ -56,6 +56,7 @@ from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -63,6 +64,7 @@ from typing import (
     List,
     Optional,
     Protocol,
+    Sequence,
     Set,
     Tuple,
     TypeVar,
@@ -70,6 +72,7 @@ from typing import (
 )
 
 import numpy as np
+import orjson
 import psutil
 import pybase64
 import requests
@@ -88,8 +91,12 @@ from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
 from typing_extensions import Literal
 
+from sglang.srt.environ import envs
 from sglang.srt.metrics.func_timer import enable_func_timer
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
+
 logger = logging.getLogger(__name__)
 
 show_time_cost = False
@@ -162,6 +169,20 @@ def _check(cc_major):
     ) >= (12, 3)
 
 
+@contextmanager
+def device_context(device: torch.device):
+    if device.type == "cpu" and is_cpu():
+        with torch.device("cpu"):
+            yield
+    else:
+        module = torch.get_device_module(device)
+        if module is not None:
+            with module.device(device.index):
+                yield
+        else:
+            raise ValueError(f"Unknown device module: {device}")
+
+
 is_ampere_with_cuda_12_3 = lambda: _check(8)
 is_hopper_with_cuda_12_3 = lambda: _check(9)
 
@@ -173,6 +194,15 @@ def is_blackwell():
     return torch.cuda.get_device_capability()[0] == 10
 
 
+@lru_cache(maxsize=1)
+def is_sm120_supported(device=None) -> bool:
+    if not is_cuda_alike():
+        return False
+    return (torch.cuda.get_device_capability(device)[0] == 12) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
 @lru_cache(maxsize=1)
 def is_sm100_supported(device=None) -> bool:
     if not is_cuda_alike():
@@ -228,7 +258,7 @@ def support_triton(backend: str) -> bool:
 
 
 try:
-    import sgl_kernel
+    import sgl_kernel  # noqa: F401
 
     is_intel_amx_backend_available = hasattr(
         torch.ops.sgl_kernel, "convert_weight_packed"
@@ -253,6 +283,14 @@ def use_intel_amx_backend(layer):
     return getattr(layer, "use_intel_amx_backend", False)
 
 
+def xpu_has_xmx_support():
+    # TODO: update with XPU capalibity query
+    if is_xpu():
+        # currently only PVC/LNL/BMG supports F64, so we only support these now
+        return torch.xpu.get_device_properties().has_fp64
+    return False
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -263,6 +301,17 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
 
 
+def is_nvidia_cublas_cu12_version_ge_12_9():
+    """
+    temporary fix for issue #11272
+    """
+    try:
+        installed_version = version("nvidia-cublas-cu12")
+    except PackageNotFoundError:
+        return False
+    return pkg_version.parse(installed_version) >= pkg_version.parse("12.9")
+
+
 def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
@@ -409,7 +458,15 @@ def get_available_gpu_memory(
 
         if empty_cache:
             torch.cuda.empty_cache()
-        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+        SHARED_SYSMEM_DEVICE_MEM_SMS = (87, 110, 121)  # Orin, Thor, Spark
+        if get_device_sm() in SHARED_SYSMEM_DEVICE_MEM_SMS:
+            # On these devices, which use sysmem as device mem, torch.cuda.mem_get_info()
+            # only reports "free" memory, which can be lower than what is actually
+            # available due to not including cache memory. So we use the system available
+            # memory metric instead.
+            free_gpu_memory = psutil.virtual_memory().available
+        else:
+            free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
 
     elif device == "xpu":
         num_gpus = torch.xpu.device_count()
@@ -453,6 +510,8 @@ def get_available_gpu_memory(
                 f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
                 "which may cause useless memory allocation for torch NPU context.",
             )
+        if empty_cache:
+            torch.npu.empty_cache()
         free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
 
     if distributed:
@@ -481,13 +540,13 @@ def make_layers(
     pp_size: Optional[int] = None,
     prefix: str = "",
     return_tuple: bool = False,
-    offloader_kwargs: Dict[str, Any] = {},
+    offloader_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Tuple[torch.nn.Module, int, int]:
     """Make a list of layers with the given layer function"""
     # circula imports
     from sglang.srt.distributed import get_pp_indices
     from sglang.srt.layers.utils import PPMissingLayer
-    from sglang.srt.offloader import get_offloader
+    from sglang.srt.utils.offloader import get_offloader
 
     assert not pp_size or num_hidden_layers >= pp_size
     start_layer, end_layer = (
@@ -506,7 +565,7 @@ def make_layers(
                 layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
                 for idx in range(start_layer, end_layer)
             ),
-            **offloader_kwargs,
+            **(offloader_kwargs or {}),
         )
         + [
             PPMissingLayer(return_tuple=return_tuple)
@@ -518,6 +577,24 @@ def make_layers(
     return modules, start_layer, end_layer
 
 
+def make_layers_non_pp(
+    num_hidden_layers: int,
+    layer_fn: LayerFn,
+    prefix: str = "",
+) -> torch.nn.ModuleList:
+    from sglang.srt.utils.offloader import get_offloader
+
+    layers = torch.nn.ModuleList(
+        get_offloader().wrap_modules(
+            (
+                layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
+                for idx in range(num_hidden_layers)
+            )
+        )
+    )
+    return layers
+
+
 cmo_stream = None
 
 
@@ -811,9 +888,9 @@ def get_image_bytes(image_file: Union[str, bytes]):
             return f.read()
     elif image_file.startswith("data:"):
         image_file = image_file.split(",")[1]
-        return pybase64.b64decode(image_file)
+        return pybase64.b64decode(image_file, validate=True)
     elif isinstance(image_file, str):
-        return pybase64.b64decode(image_file)
+        return pybase64.b64decode(image_file, validate=True)
     else:
         raise NotImplementedError(f"Invalid image: {image_file}")
 
@@ -850,7 +927,7 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
             vr = VideoReader(tmp_file.name, ctx=ctx)
         elif video_file.startswith("data:"):
             _, encoded = video_file.split(",", 1)
-            video_bytes = pybase64.b64decode(encoded)
+            video_bytes = pybase64.b64decode(encoded, validate=True)
             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             tmp_file.write(video_bytes)
             tmp_file.close()
@@ -858,7 +935,7 @@
         elif os.path.isfile(video_file):
             vr = VideoReader(video_file, ctx=ctx)
         else:
-            video_bytes = pybase64.b64decode(video_file)
+            video_bytes = pybase64.b64decode(video_file, validate=True)
             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             tmp_file.write(video_bytes)
             tmp_file.close()
@@ -1007,7 +1084,7 @@ def monkey_patch_vllm_gguf_config():
 
     def get_quant_method_with_embedding_replaced(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> Optional[QuantizeMethodBase]:
         if isinstance(layer, LinearBase):
             return GGUFLinearMethod(self)
         elif isinstance(layer, VocabParallelEmbedding):
@@ -1083,7 +1160,7 @@ def configure_logger(server_args, prefix: str = ""):
                 f"{SGLANG_LOGGING_CONFIG_PATH} but it does not exist!"
             )
         with open(SGLANG_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
-            custom_config = json.loads(file.read())
+            custom_config = orjson.loads(file.read())
         logging.config.dictConfig(custom_config)
         return
     format = f"[%(asctime)s{prefix}] %(message)s"
@@ -1262,8 +1339,46 @@ def pytorch_profile(name, func, *args, data_size=-1):
 
 
 def get_zmq_socket(
-    context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
-):
+    context: zmq.Context,
+    socket_type: zmq.SocketType,
+    endpoint: Optional[str] = None,
+    bind: bool = True,
+) -> Union[zmq.Socket, Tuple[int, zmq.Socket]]:
+    """Create and configure a ZeroMQ socket.
+
+    Args:
+        context: ZeroMQ context to create the socket from.
+        socket_type: Type of ZeroMQ socket to create.
+        endpoint: Optional endpoint to bind/connect to. If None, binds to a random TCP port.
+        bind: Whether to bind (True) or connect (False) to the endpoint. Ignored if endpoint is None.
+
+    Returns:
+        If endpoint is None: Tuple of (port, socket) where port is the randomly assigned TCP port.
+        If endpoint is provided: The configured ZeroMQ socket.
+    """
+    socket = context.socket(socket_type)
+
+    if endpoint is None:
+        # Bind to random TCP port
+        config_socket(socket, socket_type)
+        port = socket.bind_to_random_port("tcp://*")
+        return port, socket
+    else:
+        # Handle IPv6 if endpoint contains brackets
+        if endpoint.find("[") != -1:
+            socket.setsockopt(zmq.IPV6, 1)
+
+        config_socket(socket, socket_type)
+
+        if bind:
+            socket.bind(endpoint)
+        else:
+            socket.connect(endpoint)
+
+        return socket
+
+
+def config_socket(socket, socket_type: zmq.SocketType):
    mem = psutil.virtual_memory()
    total_mem = mem.total / 1024**3
    available_mem = mem.available / 1024**3
@@ -1272,10 +1387,6 @@ def get_zmq_socket(
     else:
         buf_size = -1
 
-    socket = context.socket(socket_type)
-    if endpoint.find("[") != -1:
-        socket.setsockopt(zmq.IPV6, 1)
-
     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
         socket.setsockopt(zmq.SNDBUF, buf_size)
@@ -1288,19 +1399,12 @@
         set_send_opt()
     elif socket_type == zmq.PULL:
         set_recv_opt()
-    elif socket_type == zmq.DEALER:
+    elif socket_type in [zmq.DEALER, zmq.REQ, zmq.REP]:
         set_send_opt()
         set_recv_opt()
     else:
         raise ValueError(f"Unsupported socket type: {socket_type}")
 
-    if bind:
-        socket.bind(endpoint)
-    else:
-        socket.connect(endpoint)
-
-    return socket
-
 
 def dump_to_file(dirpath, name, value):
     from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -1500,7 +1604,7 @@ def get_hpu_memory_capacity():
 
 def get_npu_memory_capacity():
     try:
-        import torch_npu
+        import torch_npu  # noqa: F401
 
         return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
     except ImportError as e:
@@ -1521,13 +1625,18 @@ def get_cpu_memory_capacity():
         for numa_id in range(n_numa_node):
             file_meminfo = f"node{numa_id}/meminfo"
             with open(os.path.join(file_prefix, file_meminfo), "r") as f:
-                # 1st line
-                line = f.readline()
-                numa_mem_list.append(int(line.split()[3]))
+                # MemTotal info is at the 1st line
+                line = f.readline()
+                # Expected format: "Node 0 MemTotal: 100000000 kB"
+                parts = line.split()
+                if len(parts) >= 4 and parts[2] == "MemTotal:":
+                    numa_mem_list.append(int(parts[3]))
+                else:
+                    raise ValueError(f"Unexpected format in {file_meminfo}: {line}")
         # Retrieved value in KB, need MB
         numa_mem = float(min(numa_mem_list) // 1024)
         return numa_mem
-    except FileNotFoundError:
+    except (FileNotFoundError, ValueError, IndexError):
         numa_mem = psutil.virtual_memory().total / n_numa_node
         # Retrieved value in Byte, need MB
         return float(numa_mem // (1 << 20))
@@ -1687,7 +1796,7 @@ def get_device(device_id: Optional[int] = None) -> str:
 
     if is_habana_available():
         try:
-            import habana_frameworks.torch.hpu
+            import habana_frameworks.torch.hpu  # noqa: F401
 
             if torch.hpu.is_available():
                 if device_id == None:
@@ -1717,7 +1826,7 @@ def get_device_count() -> int:
 
     if is_habana_available():
         try:
-            import habana_frameworks.torch.hpu
+            import habana_frameworks.torch.hpu  # noqa: F401
 
             if torch.hpu.is_available():
                 return torch.hpu.device_count()
@@ -1860,7 +1969,9 @@ def direct_register_custom_op(
         if fake_impl is not None:
             my_lib._register_fake(op_name, fake_impl)
     except RuntimeError as error:
-        if "Tried to register an operator" in str(error):
+        if "Tried to register an operator" in str(error) and "multiple times" in str(
+            error
+        ):
             # Silently ignore duplicate registration errors
             # This can happen in multi-engine scenarios
             pass
@@ -1873,6 +1984,7 @@ def direct_register_custom_op(
 
 
 def set_gpu_proc_affinity(
+    pp_size: int,
     tp_size: int,
     nnodes: int,
     gpu_id: int,
@@ -1881,7 +1993,8 @@ def set_gpu_proc_affinity(
     pid = os.getpid()
     p = psutil.Process(pid)
 
-    tp_size_per_node = tp_size // nnodes
+    nnodes_per_tp_group = max(nnodes // pp_size, 1)
+    tp_size_per_node = tp_size // nnodes_per_tp_group
 
     # total physical cores
     total_pcores = psutil.cpu_count(logical=False)
@@ -1993,7 +2106,7 @@ class MultiprocessingSerializer:
 
         if output_str:
             # Convert bytes to base64-encoded string
-            output = base64.b64encode(output).decode("utf-8")
+            output = pybase64.b64encode(output).decode("utf-8")
 
         return output
 
@@ -2164,6 +2277,11 @@ def launch_dummy_health_check_server(host, port, enable_metrics):
 
     app = FastAPI()
 
+    @app.get("/ping")
+    async def ping():
+        """Could be used by the checkpoint-engine update script to confirm the server is up."""
+        return Response(status_code=200)
+
     @app.get("/health")
     async def health():
         """Check the health of the http server."""
@@ -2286,6 +2404,8 @@ def retry(
         try:
             return fn()
         except Exception as e:
+            traceback.print_exc()
+
             if try_index >= max_retry:
                 raise Exception(f"retry() exceed maximum number of retries.")
 
@@ -2299,11 +2419,30 @@ def retry(
             logger.warning(
                 f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
             )
-            traceback.print_exc()
 
             time.sleep(delay)
 
 
+def has_hf_quant_config(model_path: str) -> bool:
+    """Check if the model path contains hf_quant_config.json file.
+
+    Args:
+        model_path: Path to the model, can be local path or remote URL.
+
+    Returns:
+        True if hf_quant_config.json exists, False otherwise.
+    """
+    if os.path.exists(os.path.join(model_path, "hf_quant_config.json")):
+        return True
+    try:
+        from huggingface_hub import HfApi
+
+        hf_api = HfApi()
+        return hf_api.file_exists(model_path, "hf_quant_config.json")
+    except Exception:
+        return False
+
+
 def flatten_nested_list(nested_list):
     if isinstance(nested_list, list):
         return [
@@ -2461,6 +2600,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen2ForCausalLM",
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
+        "Olmo2ForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
         "Qwen3ForCausalLM",
@@ -2494,9 +2634,9 @@ def log_info_on_rank0(logger, msg):
 
 def load_json_config(data: str):
     try:
-        return json.loads(data)
+        return orjson.loads(data)
     except JSONDecodeError:
-        return json.loads(Path(data).read_text())
+        return orjson.loads(Path(data).read_text())
 
 
 def dispose_tensor(x: torch.Tensor):
@@ -2863,7 +3003,7 @@ def get_cpu_ids_by_node():
 def is_shm_available(dtype, world_size, local_size):
     return (
         cpu_has_amx_support()
-        and dtype in [torch.bfloat16, torch.float]
+        and dtype in [torch.bfloat16, torch.float16, torch.float]
         and world_size >= 1
         and world_size == local_size
     )
@@ -2914,10 +3054,6 @@ def lru_cache_frozenset(maxsize=128):
     return decorator
 
 
-def get_origin_rid(rid):
-    return rid.split("_", 1)[1] if "_" in rid else rid
-
-
 def apply_module_patch(target_module, target_function, wrappers):
     original_module, original_function = parse_module_path(
         target_module, target_function, False
@@ -3205,7 +3341,7 @@ def numa_bind_to_node(node: int):
 
 def json_list_type(value):
     try:
-        return json.loads(value)
+        return orjson.loads(value)
     except json.JSONDecodeError:
         raise argparse.ArgumentTypeError(
             f"Invalid JSON list: {value}. Please provide a valid JSON list."
@@ -3213,7 +3349,12 @@ def json_list_type(value):
 
 
 @contextmanager
-def temp_set_cuda_visible_devices(gpu_id: int):
+def maybe_reindex_device_id(gpu_id: int):
+
+    if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() is False or not is_cuda_alike():
+        yield gpu_id
+        return
+
     original_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
     if original_cuda_visible_devices:
         cuda_visible_devices = original_cuda_visible_devices.split(",")
@@ -3222,7 +3363,11 @@ def temp_set_cuda_visible_devices(gpu_id: int):
 
     str_gpu_id = cuda_visible_devices[gpu_id] if cuda_visible_devices else str(gpu_id)
     os.environ["CUDA_VISIBLE_DEVICES"] = str_gpu_id
-    yield
+
+    logger.debug(f"Set CUDA_VISIBLE_DEVICES to {str_gpu_id}")
+
+    yield 0
+
     if original_cuda_visible_devices:
         os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible_devices
     else:
```
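The reworked `get_zmq_socket` now documents two calling modes: with `endpoint=None` it binds to a random TCP port and returns `(port, socket)`, otherwise it returns just the configured socket. A minimal sketch of both modes, assuming the helper remains re-exported from `sglang.srt.utils` and using an illustrative PUSH/PULL pair:

```python
import zmq

from sglang.srt.utils import get_zmq_socket  # assumed re-export of utils/common.py

context = zmq.Context()

# Mode 1: endpoint=None -> bind to a random TCP port, returns (port, socket).
port, pull_socket = get_zmq_socket(context, zmq.PULL)

# Mode 2: explicit endpoint -> returns just the configured socket;
# bind=False connects instead of binding.
push_socket = get_zmq_socket(
    context, zmq.PUSH, endpoint=f"tcp://127.0.0.1:{port}", bind=False
)

push_socket.send(b"ping")
assert pull_socket.recv() == b"ping"
```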
sglang/srt/utils/hf_transformers_utils.py
CHANGED

```diff
@@ -16,9 +16,10 @@
 import contextlib
 import json
 import os
+import tempfile
 import warnings
 from pathlib import Path
-from typing import Any, Dict, Optional, Type, Union
+from typing import Any, Dict, List, Optional, Type, Union
 
 import torch
 from huggingface_hub import snapshot_download
@@ -45,27 +46,37 @@ from sglang.srt.configs import (
     KimiVLConfig,
     LongcatFlashConfig,
     MultiModalityConfig,
+    NemotronHConfig,
+    Olmo3Config,
     Qwen3NextConfig,
     Step3VLConfig,
 )
+from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
 from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
 from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
 
-_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
-    ChatGLMConfig.model_type: ChatGLMConfig,
-    DbrxConfig.model_type: DbrxConfig,
-    ExaoneConfig.model_type: ExaoneConfig,
-    DeepseekVL2Config.model_type: DeepseekVL2Config,
-    MultiModalityConfig.model_type: MultiModalityConfig,
-    KimiVLConfig.model_type: KimiVLConfig,
-    InternVLChatConfig.model_type: InternVLChatConfig,
-    Step3VLConfig.model_type: Step3VLConfig,
-    LongcatFlashConfig.model_type: LongcatFlashConfig,
-    Qwen3NextConfig.model_type: Qwen3NextConfig,
-    FalconH1Config.model_type: FalconH1Config,
-    DotsVLMConfig.model_type: DotsVLMConfig,
-    DotsOCRConfig.model_type: DotsOCRConfig,
+_CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
+    ChatGLMConfig,
+    DbrxConfig,
+    ExaoneConfig,
+    DeepseekVL2Config,
+    MultiModalityConfig,
+    KimiVLConfig,
+    InternVLChatConfig,
+    Step3VLConfig,
+    LongcatFlashConfig,
+    Olmo3Config,
+    Qwen3NextConfig,
+    FalconH1Config,
+    DotsVLMConfig,
+    DotsOCRConfig,
+    NemotronHConfig,
+    DeepseekVLV2Config,
+]
+
+_CONFIG_REGISTRY = {
+    config_cls.model_type: config_cls for config_cls in _CONFIG_REGISTRY
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
@@ -106,6 +117,12 @@ def get_hf_text_config(config: PretrainedConfig):
         # if transformers config doesn't align with this assumption.
         assert hasattr(config.text_config, "num_attention_heads")
         return config.text_config
+
+    if hasattr(config, "llm_config"):
+        # PointsV1.5 Chat Model
+        assert hasattr(config.llm_config, "num_attention_heads")
+        return config.llm_config
+
     if hasattr(config, "language_config"):
         return config.language_config
     if hasattr(config, "thinker_config"):
@@ -143,7 +160,7 @@ def _load_deepseek_v32_model(
     config_json["architectures"] = ["DeepseekV3ForCausalLM"]
     config_json["model_type"] = "deepseek_v3"
 
-    tmp_path = os.path.join("/tmp", "_tmp_config_folder")
+    tmp_path = os.path.join(tempfile.gettempdir(), "_tmp_config_folder")
     os.makedirs(tmp_path, exist_ok=True)
 
     unique_path = os.path.join(tmp_path, f"deepseek_v32_{os.getpid()}")
@@ -180,6 +197,11 @@ def get_config(
         config = AutoConfig.from_pretrained(
             model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
         )
+        if "deepseek-ai/DeepSeek-OCR" in model:
+            config.model_type = "deepseek-ocr"
+            # Due to an unknown reason, Hugging Face's AutoConfig mistakenly recognizes the configuration of deepseek-ocr as deepseekvl2.
+            # This is a temporary workaround and will require further optimization.
+
     except ValueError as e:
         if not "deepseek_v32" in str(e):
             raise e
@@ -202,7 +224,8 @@ def get_config(
             "intermediate_size": 4304,
             "model_type": "siglip_vision_model",
             "num_attention_heads": 16,
-            "num_hidden_layers": 26,
+            "num_hidden_layers": 26,
+            # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
             "patch_size": 14,
         }
         config.vision_config = SiglipVisionConfig(**vision_config)
```
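The `_CONFIG_REGISTRY` rewrite replaces a hand-maintained `model_type → class` mapping with a plain list from which the mapping is derived, so a registry key can no longer drift out of sync with its class. A toy sketch of the pattern (class names here are illustrative, not sglang's):

```python
from typing import Dict, List, Type


class FooConfig:
    model_type = "foo"


class BarConfig:
    model_type = "bar"


# Register classes once; the lookup key comes from each class itself.
_registry_classes: List[Type] = [FooConfig, BarConfig]
_registry: Dict[str, Type] = {cls.model_type: cls for cls in _registry_classes}

assert _registry["foo"] is FooConfig
assert _registry["bar"] is BarConfig
```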
sglang/srt/{offloader.py → utils/offloader.py}
CHANGED

```diff
@@ -11,14 +11,14 @@ from sglang.srt.distributed.naive_distributed import (
     get_naive_distributed,
     set_naive_distributed,
 )
-from sglang.srt.host_shared_memory import (
+from sglang.srt.layers.parameter import ModelWeightParameter
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
+from sglang.srt.utils.host_shared_memory import (
     HostSharedMemoryManager,
     get_host_shared_memory_manager,
     set_host_shared_memory_manager,
 )
-from sglang.srt.layers.parameter import ModelWeightParameter
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
 
 logger = logging.getLogger(__name__)
 
```
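Several modules moved under `sglang/srt/utils/` in this release (`offloader.py`, `host_shared_memory.py`, `aio_rwlock.py`, `torch_memory_saver_adapter.py`, `bench_utils.py`). Downstream code that imports the old paths must switch to the new ones; a hedged compatibility sketch for code that needs to run against both versions (sglang itself may or may not ship such a shim):

```python
try:
    # 0.5.4+ layout
    from sglang.srt.utils.offloader import get_offloader
except ImportError:
    # pre-0.5.4 layout
    from sglang.srt.offloader import get_offloader
```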