sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
- sglang/bench_one_batch_server.py +340 -34
- sglang/bench_serving.py +340 -159
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +9 -2
- sglang/profiler.py +20 -3
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +309 -0
- sglang/srt/configs/load_config.py +33 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +284 -118
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +576 -0
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +6 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -15
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +268 -98
- sglang/srt/disaggregation/decode.py +172 -39
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +203 -555
- sglang/srt/disaggregation/nixl/conn.py +217 -63
- sglang/srt/disaggregation/prefill.py +113 -270
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +203 -97
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +85 -65
- sglang/srt/entrypoints/grpc_server.py +632 -305
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +169 -17
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +327 -34
- sglang/srt/entrypoints/openai/serving_base.py +74 -8
- sglang/srt/entrypoints/openai/serving_chat.py +202 -118
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +20 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +47 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +323 -0
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +21 -16
- sglang/srt/function_call/glm4_moe_detector.py +4 -8
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +61 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +98 -7
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/grpc_request_manager.py +915 -0
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
- sglang/srt/layers/activation.py +11 -7
- sglang/srt/layers/attention/aiter_backend.py +17 -18
- sglang/srt/layers/attention/ascend_backend.py +125 -10
- sglang/srt/layers/attention/attention_registry.py +226 -0
- sglang/srt/layers/attention/base_attn_backend.py +32 -4
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +52 -15
- sglang/srt/layers/attention/flashinfer_backend.py +357 -212
- sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
- sglang/srt/layers/attention/flashmla_backend.py +9 -7
- sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
- sglang/srt/layers/attention/mamba/mamba.py +514 -1
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +23 -0
- sglang/srt/layers/attention/nsa_backend.py +1201 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +249 -42
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
- sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +61 -3
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +19 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +28 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +47 -15
- sglang/srt/layers/linear.py +30 -5
- sglang/srt/layers/logits_processor.py +161 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
- sglang/srt/layers/moe/ep_moe/layer.py +243 -448
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +27 -1
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +86 -20
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +43 -15
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +141 -81
- sglang/srt/layers/quantization/mxfp4.py +17 -34
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -24
- sglang/srt/layers/quantization/w8a8_int8.py +45 -27
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +750 -46
- sglang/srt/layers/sampler.py +84 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +23 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +9 -4
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +33 -7
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +41 -17
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +83 -152
- sglang/srt/managers/data_parallel_controller.py +156 -87
- sglang/srt/managers/detokenizer_manager.py +51 -24
- sglang/srt/managers/io_struct.py +223 -129
- sglang/srt/managers/mm_utils.py +49 -10
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +130 -0
- sglang/srt/managers/schedule_batch.py +340 -529
- sglang/srt/managers/schedule_policy.py +158 -18
- sglang/srt/managers/scheduler.py +665 -620
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
- sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
- sglang/srt/managers/tokenizer_manager.py +462 -226
- sglang/srt/managers/tp_worker.py +217 -156
- sglang/srt/managers/utils.py +79 -47
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +42 -28
- sglang/srt/mem_cache/base_prefix_cache.py +3 -3
- sglang/srt/mem_cache/chunk_cache.py +20 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +38 -0
- sglang/srt/mem_cache/hicache_storage.py +44 -2
- sglang/srt/mem_cache/hiradix_cache.py +134 -34
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +602 -208
- sglang/srt/mem_cache/memory_pool_host.py +134 -183
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +263 -78
- sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +115 -58
- sglang/srt/metrics/collector.py +113 -120
- sglang/srt/metrics/func_timer.py +3 -8
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +81 -36
- sglang/srt/model_executor/forward_batch_info.py +40 -50
- sglang/srt/model_executor/model_runner.py +507 -319
- sglang/srt/model_executor/npu_graph_runner.py +11 -5
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +438 -37
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +200 -27
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +40 -56
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +25 -4
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +793 -235
- sglang/srt/models/dots_ocr.py +171 -0
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +570 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -3
- sglang/srt/models/glm4_moe.py +17 -40
- sglang/srt/models/glm4_moe_nextn.py +4 -4
- sglang/srt/models/glm4v.py +3 -2
- sglang/srt/models/glm4v_moe.py +6 -6
- sglang/srt/models/gpt_oss.py +12 -35
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +4 -2
- sglang/srt/models/llama.py +6 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +6 -23
- sglang/srt/models/longcat_flash_nextn.py +4 -15
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +27 -6
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +5 -5
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +70 -4
- sglang/srt/models/qwen2_vl.py +6 -3
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +50 -38
- sglang/srt/models/qwen3_next.py +43 -21
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +791 -0
- sglang/srt/models/qwen3_vl_moe.py +343 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +268 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +61 -0
- sglang/srt/multimodal/processors/base_processor.py +21 -9
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +2 -4
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +20 -10
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +83 -17
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +36 -23
- sglang/srt/sampling/sampling_params.py +75 -0
- sglang/srt/server_args.py +1300 -338
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +161 -0
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
- sglang/srt/speculative/eagle_info.py +786 -0
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +113 -1270
- sglang/srt/speculative/eagle_worker.py +120 -285
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/ngram_info.py +433 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +49 -0
- sglang/srt/speculative/spec_utils.py +641 -0
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +35 -18
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/{utils.py → utils/common.py} +583 -113
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +120 -11
- sglang/test/runners.py +3 -1
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +8 -2
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +3 -4
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +430 -0
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +93 -1
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +432 -16
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
- sglang/srt/entrypoints/grpc_request_manager.py +0 -580
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
|
|
8
|
+
from sglang.srt.managers.schedule_batch import ServerArgs
|
|
9
|
+
from sglang.srt.utils import is_cpu, is_cuda
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ElasticEPState:
    """Tracks which expert-parallel (EP) ranks are currently active.

    Holds the live tensor of active ranks, a snapshot of the previous
    state for change detection, and a CPU mirror for host-side reads.
    """

    # Per-rank activity flags; 1 marks a healthy/active rank (see
    # ElasticEPStateManager.healthy_rank_state). None until initialized.
    active_ranks: Optional[torch.Tensor]
    # Baseline copy of active_ranks taken by snapshot_active_to_last().
    last_active_ranks: Optional[torch.Tensor]
    # Host-side copy of active_ranks refreshed by sync_active_to_cpu().
    active_ranks_cpu: Optional[torch.Tensor]

    def is_active_equal_last(self) -> bool:
        """Return True when the current active-rank set matches the snapshot.

        Unset fields are handled consistently with the other methods'
        None guards: two None fields compare equal, a None/tensor mix
        compares unequal (previously this raised TypeError on None).
        """
        if self.active_ranks is None or self.last_active_ranks is None:
            return self.active_ranks is self.last_active_ranks
        return torch.equal(self.active_ranks, self.last_active_ranks)

    def sync_active_to_cpu(self) -> None:
        """Refresh the CPU mirror of active_ranks; no-op when unset."""
        if self.active_ranks is not None:
            self.active_ranks_cpu = self.active_ranks.detach().cpu().clone()

    def snapshot_active_to_last(self) -> None:
        """Record current active_ranks as the comparison baseline; no-op when unset."""
        if self.active_ranks is not None:
            self.last_active_ranks = self.active_ranks.clone()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ElasticEPStateManager:
    """Process-wide singleton owning the elastic expert-parallel state."""

    # Lazily created by init(); stays None when elastic EP is disabled.
    _instance: Optional[ElasticEPState] = None

    @classmethod
    def instance(cls) -> ElasticEPState:
        """Return the singleton state (None until init() has created it)."""
        return cls._instance

    @classmethod
    def init(cls, server_args: ServerArgs):
        """Create the singleton on first call, when an elastic EP backend is configured.

        Idempotent: subsequent calls return the already-built state.
        """
        if cls._instance is None and server_args.elastic_ep_backend is not None:
            cls._instance = cls._build_state(ep_size=None, device=None)
        return cls._instance

    @staticmethod
    def _select_device() -> torch.device:
        """Pick the default device; elastic EP supports only CUDA and CPU."""
        if is_cuda():
            return torch.device("cuda")
        if is_cpu():
            return torch.device("cpu")
        raise NotImplementedError("Only CUDA and CPU support elastic ep now.")

    @classmethod
    def _build_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> ElasticEPState:
        """Assemble a fresh all-healthy state with baseline snapshot and CPU mirror."""
        ranks = cls.healthy_rank_state(ep_size=ep_size, device=device)
        return ElasticEPState(
            active_ranks=ranks,
            last_active_ranks=ranks.clone(),
            active_ranks_cpu=ranks.detach().cpu().clone(),
        )

    @classmethod
    def healthy_rank_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> torch.Tensor:
        """Return an int32 ones tensor marking every rank as healthy.

        ep_size defaults to the distributed world size; device defaults
        to _select_device().
        """
        size = torch.distributed.get_world_size() if ep_size is None else ep_size
        target = cls._select_device() if device is None else device
        return torch.ones(size, dtype=torch.int32, device=target)
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
|
2
2
|
# Copied from vLLM
|
|
3
|
-
import json
|
|
4
3
|
import logging
|
|
5
4
|
from abc import ABC, abstractmethod
|
|
6
5
|
from typing import Union
|
|
7
6
|
|
|
7
|
+
import orjson
|
|
8
|
+
|
|
8
9
|
logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
10
11
|
try:
|
|
@@ -148,7 +149,7 @@ class HarmonyContext(ConversationContext):
|
|
|
148
149
|
if isinstance(tool_session, Tool):
|
|
149
150
|
return await tool_session.get_result(self)
|
|
150
151
|
tool_name = last_msg.recipient.split(".")[1]
|
|
151
|
-
args =
|
|
152
|
+
args = orjson.loads(last_msg.content[0].text)
|
|
152
153
|
result = await tool_session.call_tool(tool_name, args)
|
|
153
154
|
result_str = result.content[0].text
|
|
154
155
|
content = TextContent(text=result_str)
|
sglang/srt/entrypoints/engine.py
CHANGED
|
@@ -30,8 +30,6 @@ import time
|
|
|
30
30
|
from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
|
|
31
31
|
|
|
32
32
|
import zmq
|
|
33
|
-
import zmq.asyncio
|
|
34
|
-
from PIL.Image import Image
|
|
35
33
|
|
|
36
34
|
from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
|
|
37
35
|
|
|
@@ -47,6 +45,7 @@ from sglang.srt.managers.data_parallel_controller import (
|
|
|
47
45
|
)
|
|
48
46
|
from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
|
|
49
47
|
from sglang.srt.managers.io_struct import (
|
|
48
|
+
DestroyWeightsUpdateGroupReqInput,
|
|
50
49
|
EmbeddingReqInput,
|
|
51
50
|
GenerateReqInput,
|
|
52
51
|
GetWeightsByNameReqInput,
|
|
@@ -60,6 +59,7 @@ from sglang.srt.managers.io_struct import (
|
|
|
60
59
|
UnloadLoRAAdapterReqInput,
|
|
61
60
|
UpdateWeightFromDiskReqInput,
|
|
62
61
|
UpdateWeightsFromDistributedReqInput,
|
|
62
|
+
UpdateWeightsFromIPCReqInput,
|
|
63
63
|
UpdateWeightsFromTensorReqInput,
|
|
64
64
|
)
|
|
65
65
|
from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
|
|
@@ -67,7 +67,6 @@ from sglang.srt.managers.scheduler import run_scheduler_process
|
|
|
67
67
|
from sglang.srt.managers.template_manager import TemplateManager
|
|
68
68
|
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
|
69
69
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
|
70
|
-
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
|
71
70
|
from sglang.srt.utils import (
|
|
72
71
|
MultiprocessingSerializer,
|
|
73
72
|
assert_pkg_version,
|
|
@@ -77,10 +76,12 @@ from sglang.srt.utils import (
|
|
|
77
76
|
is_cuda,
|
|
78
77
|
kill_process_tree,
|
|
79
78
|
launch_dummy_health_check_server,
|
|
79
|
+
maybe_reindex_device_id,
|
|
80
80
|
prepare_model_and_tokenizer,
|
|
81
81
|
set_prometheus_multiproc_dir,
|
|
82
82
|
set_ulimit,
|
|
83
83
|
)
|
|
84
|
+
from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
|
84
85
|
from sglang.version import __version__
|
|
85
86
|
|
|
86
87
|
logger = logging.getLogger(__name__)
|
|
@@ -146,6 +147,12 @@ class Engine(EngineBase):
|
|
|
146
147
|
thread_label = "Tokenizer"
|
|
147
148
|
trace_set_thread_info(thread_label)
|
|
148
149
|
|
|
150
|
+
try:
|
|
151
|
+
self.loop = asyncio.get_running_loop()
|
|
152
|
+
except RuntimeError:
|
|
153
|
+
self.loop = asyncio.new_event_loop()
|
|
154
|
+
asyncio.set_event_loop(self.loop)
|
|
155
|
+
|
|
149
156
|
def generate(
|
|
150
157
|
self,
|
|
151
158
|
# The input prompt. It can be a single prompt or a batch of prompts.
|
|
@@ -209,7 +216,6 @@ class Engine(EngineBase):
|
|
|
209
216
|
bootstrap_room=bootstrap_room,
|
|
210
217
|
data_parallel_rank=data_parallel_rank,
|
|
211
218
|
)
|
|
212
|
-
loop = asyncio.get_event_loop()
|
|
213
219
|
generator = self.tokenizer_manager.generate_request(obj, None)
|
|
214
220
|
|
|
215
221
|
if stream:
|
|
@@ -217,14 +223,14 @@ class Engine(EngineBase):
|
|
|
217
223
|
def generator_wrapper():
|
|
218
224
|
while True:
|
|
219
225
|
try:
|
|
220
|
-
chunk = loop.run_until_complete(generator.__anext__())
|
|
226
|
+
chunk = self.loop.run_until_complete(generator.__anext__())
|
|
221
227
|
yield chunk
|
|
222
228
|
except StopAsyncIteration:
|
|
223
229
|
break
|
|
224
230
|
|
|
225
231
|
return generator_wrapper()
|
|
226
232
|
else:
|
|
227
|
-
ret = loop.run_until_complete(generator.__anext__())
|
|
233
|
+
ret = self.loop.run_until_complete(generator.__anext__())
|
|
228
234
|
return ret
|
|
229
235
|
|
|
230
236
|
async def async_generate(
|
|
@@ -316,9 +322,8 @@ class Engine(EngineBase):
|
|
|
316
322
|
audio_data=audio_data,
|
|
317
323
|
video_data=video_data,
|
|
318
324
|
)
|
|
319
|
-
loop = asyncio.get_event_loop()
|
|
320
325
|
generator = self.tokenizer_manager.generate_request(obj, None)
|
|
321
|
-
ret = loop.run_until_complete(generator.__anext__())
|
|
326
|
+
ret = self.loop.run_until_complete(generator.__anext__())
|
|
322
327
|
return ret
|
|
323
328
|
|
|
324
329
|
async def async_encode(
|
|
@@ -352,9 +357,8 @@ class Engine(EngineBase):
|
|
|
352
357
|
Please refer to `EmbeddingReqInput` for the documentation.
|
|
353
358
|
"""
|
|
354
359
|
obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
|
|
355
|
-
loop = asyncio.get_event_loop()
|
|
356
360
|
generator = self.tokenizer_manager.generate_request(obj, None)
|
|
357
|
-
ret = loop.run_until_complete(generator.__anext__())
|
|
361
|
+
ret = self.loop.run_until_complete(generator.__anext__())
|
|
358
362
|
return ret
|
|
359
363
|
|
|
360
364
|
def shutdown(self):
|
|
@@ -369,38 +373,31 @@ class Engine(EngineBase):
|
|
|
369
373
|
return False
|
|
370
374
|
|
|
371
375
|
def flush_cache(self):
|
|
372
|
-
|
|
373
|
-
return loop.run_until_complete(self.tokenizer_manager.flush_cache())
|
|
376
|
+
return self.loop.run_until_complete(self.tokenizer_manager.flush_cache())
|
|
374
377
|
|
|
375
378
|
def start_profile(self, **kwargs):
|
|
376
|
-
loop
|
|
377
|
-
loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
|
|
379
|
+
self.loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
|
|
378
380
|
|
|
379
381
|
def stop_profile(self):
|
|
380
|
-
loop
|
|
381
|
-
loop.run_until_complete(self.tokenizer_manager.stop_profile())
|
|
382
|
+
self.loop.run_until_complete(self.tokenizer_manager.stop_profile())
|
|
382
383
|
|
|
383
384
|
def start_expert_distribution_record(self):
|
|
384
|
-
loop
|
|
385
|
-
loop.run_until_complete(
|
|
385
|
+
self.loop.run_until_complete(
|
|
386
386
|
self.tokenizer_manager.start_expert_distribution_record()
|
|
387
387
|
)
|
|
388
388
|
|
|
389
389
|
def stop_expert_distribution_record(self):
|
|
390
|
-
loop
|
|
391
|
-
loop.run_until_complete(
|
|
390
|
+
self.loop.run_until_complete(
|
|
392
391
|
self.tokenizer_manager.stop_expert_distribution_record()
|
|
393
392
|
)
|
|
394
393
|
|
|
395
394
|
def dump_expert_distribution_record(self):
|
|
396
|
-
loop
|
|
397
|
-
loop.run_until_complete(
|
|
395
|
+
self.loop.run_until_complete(
|
|
398
396
|
self.tokenizer_manager.dump_expert_distribution_record()
|
|
399
397
|
)
|
|
400
398
|
|
|
401
399
|
def get_server_info(self):
|
|
402
|
-
|
|
403
|
-
internal_states = loop.run_until_complete(
|
|
400
|
+
internal_states = self.loop.run_until_complete(
|
|
404
401
|
self.tokenizer_manager.get_internal_state()
|
|
405
402
|
)
|
|
406
403
|
return {
|
|
@@ -428,11 +425,22 @@ class Engine(EngineBase):
|
|
|
428
425
|
group_name=group_name,
|
|
429
426
|
backend=backend,
|
|
430
427
|
)
|
|
431
|
-
|
|
432
|
-
return loop.run_until_complete(
|
|
428
|
+
return self.loop.run_until_complete(
|
|
433
429
|
self.tokenizer_manager.init_weights_update_group(obj, None)
|
|
434
430
|
)
|
|
435
431
|
|
|
432
|
+
def destroy_weights_update_group(
|
|
433
|
+
self,
|
|
434
|
+
group_name: str,
|
|
435
|
+
):
|
|
436
|
+
"""Destroy parameter update group."""
|
|
437
|
+
obj = DestroyWeightsUpdateGroupReqInput(
|
|
438
|
+
group_name=group_name,
|
|
439
|
+
)
|
|
440
|
+
return self.loop.run_until_complete(
|
|
441
|
+
self.tokenizer_manager.destroy_weights_update_group(obj, None)
|
|
442
|
+
)
|
|
443
|
+
|
|
436
444
|
def update_weights_from_distributed(
|
|
437
445
|
self,
|
|
438
446
|
names: list[str],
|
|
@@ -449,8 +457,7 @@ class Engine(EngineBase):
|
|
|
449
457
|
group_name=group_name,
|
|
450
458
|
flush_cache=flush_cache,
|
|
451
459
|
)
|
|
452
|
-
|
|
453
|
-
return loop.run_until_complete(
|
|
460
|
+
return self.loop.run_until_complete(
|
|
454
461
|
self.tokenizer_manager.update_weights_from_distributed(obj, None)
|
|
455
462
|
)
|
|
456
463
|
|
|
@@ -474,9 +481,7 @@ class Engine(EngineBase):
|
|
|
474
481
|
load_format=load_format,
|
|
475
482
|
flush_cache=flush_cache,
|
|
476
483
|
)
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
return loop.run_until_complete(
|
|
484
|
+
return self.loop.run_until_complete(
|
|
480
485
|
self.tokenizer_manager.update_weights_from_tensor(obj, None)
|
|
481
486
|
)
|
|
482
487
|
|
|
@@ -496,16 +501,14 @@ class Engine(EngineBase):
|
|
|
496
501
|
load_format=load_format,
|
|
497
502
|
)
|
|
498
503
|
|
|
499
|
-
|
|
500
|
-
return loop.run_until_complete(
|
|
504
|
+
return self.loop.run_until_complete(
|
|
501
505
|
self.tokenizer_manager.update_weights_from_disk(obj, None)
|
|
502
506
|
)
|
|
503
507
|
|
|
504
508
|
def get_weights_by_name(self, name: str, truncate_size: int = 100):
|
|
505
509
|
"""Get weights by parameter name."""
|
|
506
510
|
obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
|
|
507
|
-
|
|
508
|
-
return loop.run_until_complete(
|
|
511
|
+
return self.loop.run_until_complete(
|
|
509
512
|
self.tokenizer_manager.get_weights_by_name(obj, None)
|
|
510
513
|
)
|
|
511
514
|
|
|
@@ -518,8 +521,7 @@ class Engine(EngineBase):
|
|
|
518
521
|
pinned=pinned,
|
|
519
522
|
)
|
|
520
523
|
|
|
521
|
-
|
|
522
|
-
return loop.run_until_complete(
|
|
524
|
+
return self.loop.run_until_complete(
|
|
523
525
|
self.tokenizer_manager.load_lora_adapter(obj, None)
|
|
524
526
|
)
|
|
525
527
|
|
|
@@ -528,22 +530,19 @@ class Engine(EngineBase):
|
|
|
528
530
|
|
|
529
531
|
obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
|
|
530
532
|
|
|
531
|
-
|
|
532
|
-
return loop.run_until_complete(
|
|
533
|
+
return self.loop.run_until_complete(
|
|
533
534
|
self.tokenizer_manager.unload_lora_adapter(obj, None)
|
|
534
535
|
)
|
|
535
536
|
|
|
536
537
|
def release_memory_occupation(self, tags: Optional[List[str]] = None):
|
|
537
538
|
obj = ReleaseMemoryOccupationReqInput(tags=tags)
|
|
538
|
-
|
|
539
|
-
return loop.run_until_complete(
|
|
539
|
+
return self.loop.run_until_complete(
|
|
540
540
|
self.tokenizer_manager.release_memory_occupation(obj, None)
|
|
541
541
|
)
|
|
542
542
|
|
|
543
543
|
def resume_memory_occupation(self, tags: Optional[List[str]] = None):
|
|
544
544
|
obj = ResumeMemoryOccupationReqInput(tags=tags)
|
|
545
|
-
|
|
546
|
-
return loop.run_until_complete(
|
|
545
|
+
return self.loop.run_until_complete(
|
|
547
546
|
self.tokenizer_manager.resume_memory_occupation(obj, None)
|
|
548
547
|
)
|
|
549
548
|
|
|
@@ -560,8 +559,7 @@ class Engine(EngineBase):
|
|
|
560
559
|
collection.
|
|
561
560
|
"""
|
|
562
561
|
|
|
563
|
-
loop
|
|
564
|
-
loop.run_until_complete(self.tokenizer_manager.freeze_gc())
|
|
562
|
+
self.loop.run_until_complete(self.tokenizer_manager.freeze_gc())
|
|
565
563
|
|
|
566
564
|
"""
|
|
567
565
|
Execute an RPC call on all scheduler processes.
|
|
@@ -619,8 +617,7 @@ class Engine(EngineBase):
|
|
|
619
617
|
ValueError: If query is not provided, or if items is not provided,
|
|
620
618
|
or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
|
|
621
619
|
"""
|
|
622
|
-
|
|
623
|
-
return loop.run_until_complete(
|
|
620
|
+
return self.loop.run_until_complete(
|
|
624
621
|
self.tokenizer_manager.score_request(
|
|
625
622
|
query=query,
|
|
626
623
|
items=items,
|
|
@@ -653,6 +650,21 @@ class Engine(EngineBase):
|
|
|
653
650
|
request=None,
|
|
654
651
|
)
|
|
655
652
|
|
|
653
|
+
def update_weights_from_ipc(
|
|
654
|
+
self,
|
|
655
|
+
zmq_handles: Dict[str, str],
|
|
656
|
+
flush_cache: bool = True,
|
|
657
|
+
):
|
|
658
|
+
"""Update weights from IPC for checkpoint-engine integration."""
|
|
659
|
+
obj = UpdateWeightsFromIPCReqInput(
|
|
660
|
+
zmq_handles=zmq_handles,
|
|
661
|
+
flush_cache=flush_cache,
|
|
662
|
+
)
|
|
663
|
+
loop = asyncio.get_event_loop()
|
|
664
|
+
return loop.run_until_complete(
|
|
665
|
+
self.tokenizer_manager.update_weights_from_ipc(obj, None)
|
|
666
|
+
)
|
|
667
|
+
|
|
656
668
|
|
|
657
669
|
def _set_envs_and_config(server_args: ServerArgs):
|
|
658
670
|
# Set global environments
|
|
@@ -666,6 +678,13 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
|
666
678
|
if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
|
|
667
679
|
os.environ["TRTLLM_ENABLE_PDL"] = "1"
|
|
668
680
|
|
|
681
|
+
if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
|
|
682
|
+
# Default to warning level, to avoid too many logs
|
|
683
|
+
os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
|
|
684
|
+
if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
|
|
685
|
+
# Need to set log to console, otherwise the log level won't take effect
|
|
686
|
+
os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
|
|
687
|
+
|
|
669
688
|
# Can also be passed as argument
|
|
670
689
|
os.environ["SGLANG_RUN_ID"] = (
|
|
671
690
|
f"sglang-run-{time.time()}-{random.randint(0, 100000000)}"
|
|
@@ -682,7 +701,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
|
682
701
|
if server_args.attention_backend == "flashinfer":
|
|
683
702
|
assert_pkg_version(
|
|
684
703
|
"flashinfer_python",
|
|
685
|
-
"0.
|
|
704
|
+
"0.4.1",
|
|
686
705
|
"Please uninstall the old version and "
|
|
687
706
|
"reinstall the latest version by following the instructions "
|
|
688
707
|
"at https://docs.flashinfer.ai/installation.html.",
|
|
@@ -690,7 +709,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
|
690
709
|
if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
|
|
691
710
|
assert_pkg_version(
|
|
692
711
|
"sgl-kernel",
|
|
693
|
-
"0.3.
|
|
712
|
+
"0.3.16.post3",
|
|
694
713
|
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
|
|
695
714
|
)
|
|
696
715
|
|
|
@@ -780,23 +799,24 @@ def _launch_subprocesses(
|
|
|
780
799
|
+ (tp_rank % tp_size_per_node) * server_args.gpu_id_step
|
|
781
800
|
)
|
|
782
801
|
moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
|
|
783
|
-
proc = mp.Process(
|
|
784
|
-
target=run_scheduler_process,
|
|
785
|
-
args=(
|
|
786
|
-
server_args,
|
|
787
|
-
port_args,
|
|
788
|
-
gpu_id,
|
|
789
|
-
tp_rank,
|
|
790
|
-
moe_ep_rank,
|
|
791
|
-
pp_rank,
|
|
792
|
-
None,
|
|
793
|
-
writer,
|
|
794
|
-
None,
|
|
795
|
-
),
|
|
796
|
-
)
|
|
797
802
|
|
|
798
|
-
with
|
|
799
|
-
proc.
|
|
803
|
+
with maybe_reindex_device_id(gpu_id) as gpu_id:
|
|
804
|
+
proc = mp.Process(
|
|
805
|
+
target=run_scheduler_process,
|
|
806
|
+
args=(
|
|
807
|
+
server_args,
|
|
808
|
+
port_args,
|
|
809
|
+
gpu_id,
|
|
810
|
+
tp_rank,
|
|
811
|
+
moe_ep_rank,
|
|
812
|
+
pp_rank,
|
|
813
|
+
None,
|
|
814
|
+
writer,
|
|
815
|
+
),
|
|
816
|
+
)
|
|
817
|
+
with memory_saver_adapter.configure_subprocess():
|
|
818
|
+
proc.start()
|
|
819
|
+
|
|
800
820
|
scheduler_procs.append(proc)
|
|
801
821
|
scheduler_pipe_readers.append(reader)
|
|
802
822
|
else:
|