sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +54 -37
- sglang/bench_one_batch_server.py +340 -34
- sglang/bench_serving.py +340 -159
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +9 -2
- sglang/profiler.py +20 -3
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +309 -0
- sglang/srt/configs/load_config.py +33 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +284 -118
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +576 -0
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +6 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -15
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +268 -98
- sglang/srt/disaggregation/decode.py +172 -39
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +203 -555
- sglang/srt/disaggregation/nixl/conn.py +217 -63
- sglang/srt/disaggregation/prefill.py +113 -270
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +203 -97
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +85 -65
- sglang/srt/entrypoints/grpc_server.py +632 -305
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +169 -17
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +327 -34
- sglang/srt/entrypoints/openai/serving_base.py +74 -8
- sglang/srt/entrypoints/openai/serving_chat.py +202 -118
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +20 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +47 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +323 -0
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +21 -16
- sglang/srt/function_call/glm4_moe_detector.py +4 -8
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +61 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +98 -7
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/grpc_request_manager.py +915 -0
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
- sglang/srt/layers/activation.py +11 -7
- sglang/srt/layers/attention/aiter_backend.py +17 -18
- sglang/srt/layers/attention/ascend_backend.py +125 -10
- sglang/srt/layers/attention/attention_registry.py +226 -0
- sglang/srt/layers/attention/base_attn_backend.py +32 -4
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +52 -15
- sglang/srt/layers/attention/flashinfer_backend.py +357 -212
- sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
- sglang/srt/layers/attention/flashmla_backend.py +9 -7
- sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
- sglang/srt/layers/attention/mamba/mamba.py +514 -1
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +23 -0
- sglang/srt/layers/attention/nsa_backend.py +1201 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +249 -42
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
- sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +61 -3
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +19 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +28 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +47 -15
- sglang/srt/layers/linear.py +30 -5
- sglang/srt/layers/logits_processor.py +161 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
- sglang/srt/layers/moe/ep_moe/layer.py +243 -448
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +27 -1
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +86 -20
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +43 -15
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +141 -81
- sglang/srt/layers/quantization/mxfp4.py +17 -34
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -24
- sglang/srt/layers/quantization/w8a8_int8.py +45 -27
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +750 -46
- sglang/srt/layers/sampler.py +84 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +23 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +9 -4
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +33 -7
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +41 -17
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +83 -152
- sglang/srt/managers/data_parallel_controller.py +156 -87
- sglang/srt/managers/detokenizer_manager.py +51 -24
- sglang/srt/managers/io_struct.py +223 -129
- sglang/srt/managers/mm_utils.py +49 -10
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +130 -0
- sglang/srt/managers/schedule_batch.py +340 -529
- sglang/srt/managers/schedule_policy.py +158 -18
- sglang/srt/managers/scheduler.py +665 -620
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
- sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
- sglang/srt/managers/tokenizer_manager.py +462 -226
- sglang/srt/managers/tp_worker.py +217 -156
- sglang/srt/managers/utils.py +79 -47
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +42 -28
- sglang/srt/mem_cache/base_prefix_cache.py +3 -3
- sglang/srt/mem_cache/chunk_cache.py +20 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +38 -0
- sglang/srt/mem_cache/hicache_storage.py +44 -2
- sglang/srt/mem_cache/hiradix_cache.py +134 -34
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +602 -208
- sglang/srt/mem_cache/memory_pool_host.py +134 -183
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +263 -78
- sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +115 -58
- sglang/srt/metrics/collector.py +113 -120
- sglang/srt/metrics/func_timer.py +3 -8
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +81 -36
- sglang/srt/model_executor/forward_batch_info.py +40 -50
- sglang/srt/model_executor/model_runner.py +507 -319
- sglang/srt/model_executor/npu_graph_runner.py +11 -5
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +438 -37
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +200 -27
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +40 -56
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +25 -4
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +793 -235
- sglang/srt/models/dots_ocr.py +171 -0
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +570 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -3
- sglang/srt/models/glm4_moe.py +17 -40
- sglang/srt/models/glm4_moe_nextn.py +4 -4
- sglang/srt/models/glm4v.py +3 -2
- sglang/srt/models/glm4v_moe.py +6 -6
- sglang/srt/models/gpt_oss.py +12 -35
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +4 -2
- sglang/srt/models/llama.py +6 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +6 -23
- sglang/srt/models/longcat_flash_nextn.py +4 -15
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +27 -6
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +5 -5
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +70 -4
- sglang/srt/models/qwen2_vl.py +6 -3
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +50 -38
- sglang/srt/models/qwen3_next.py +43 -21
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +791 -0
- sglang/srt/models/qwen3_vl_moe.py +343 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +268 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +61 -0
- sglang/srt/multimodal/processors/base_processor.py +21 -9
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +2 -4
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +20 -10
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +83 -17
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +36 -23
- sglang/srt/sampling/sampling_params.py +75 -0
- sglang/srt/server_args.py +1300 -338
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +161 -0
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
- sglang/srt/speculative/eagle_info.py +786 -0
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +113 -1270
- sglang/srt/speculative/eagle_worker.py +120 -285
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/ngram_info.py +433 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +49 -0
- sglang/srt/speculative/spec_utils.py +641 -0
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +35 -18
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/{utils.py → utils/common.py} +583 -113
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +120 -11
- sglang/test/runners.py +3 -1
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +8 -2
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +3 -4
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +430 -0
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +93 -1
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +432 -16
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
- sglang/srt/entrypoints/grpc_request_manager.py +0 -580
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/entrypoints/harmony_utils.py
+++ b/sglang/srt/entrypoints/harmony_utils.py
@@ -3,10 +3,10 @@
 # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
 # Slight differences in processing chat messages
 import datetime
-import json
 from collections.abc import Iterable
 from typing import Literal, Optional, Union
 
+import orjson
 from openai.types.responses import (
     ResponseOutputItem,
     ResponseOutputMessage,
@@ -228,7 +228,7 @@ def parse_output_message(message: Message):
         if len(message.content) != 1:
             raise ValueError("Invalid number of contents in browser message")
         content = message.content[0]
-        browser_call = json.loads(content.text)
+        browser_call = orjson.loads(content.text)
         # TODO: translate to url properly!
         if recipient == "browser.search":
             action = ActionSearch(
--- a/sglang/srt/entrypoints/http_server.py
+++ b/sglang/srt/entrypoints/http_server.py
@@ -19,7 +19,6 @@ This file implements HTTP APIs for the inference engine via fastapi.
 
 import asyncio
 import dataclasses
-import json
 import logging
 import multiprocessing as multiprocessing
 import os
@@ -29,8 +28,6 @@ import time
 from http import HTTPStatus
 from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
 
-import setproctitle
-
 from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
 
 # Fix a bug of Python threading
@@ -53,25 +50,34 @@ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
+    ClassifyRequest,
     CompletionRequest,
+    DetokenizeRequest,
     EmbeddingRequest,
     ErrorResponse,
     ModelCard,
     ModelList,
     ResponsesRequest,
     ScoringRequest,
+    TokenizeRequest,
     V1RerankReqInput,
 )
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+from sglang.srt.entrypoints.openai.serving_classify import OpenAIServingClassify
 from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
 from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
 from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.entrypoints.openai.serving_tokenize import (
+    OpenAIServingDetokenize,
+    OpenAIServingTokenize,
+)
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
@@ -90,13 +96,14 @@ from sglang.srt.managers.io_struct import (
     UnloadLoRAAdapterReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromIPCReqInput,
     UpdateWeightsFromTensorReqInput,
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
-    MultiTokenizerManager,
     MultiTokenizerRouter,
+    TokenizerWorker,
     get_main_process_id,
     monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
@@ -123,14 +130,13 @@ logger = logging.getLogger(__name__)
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
 HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+WAIT_WEIGHTS_READY_TIMEOUT = int(os.getenv("SGLANG_WAIT_WEIGHTS_READY_TIMEOUT", 120))
 
 
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: Union[
-        TokenizerManager, MultiTokenizerRouter, MultiTokenizerManager
-    ]
+    tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
     template_manager: TemplateManager
     scheduler_info: Dict
 
@@ -145,15 +151,14 @@ def set_global_state(global_state: _GlobalState):
 
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
-    pid = os.getpid()
-    main_pid = get_main_process_id()
-    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
 
     # Read configuration from shared memory
+    main_pid = get_main_process_id()
     port_args, server_args, scheduler_info = read_from_shared_memory(
         f"multi_tokenizer_args_{main_pid}"
     )
     server_args: ServerArgs
+    port_args: PortArgs
 
     # API key authentication is not supported in multi-tokenizer mode
     assert (
@@ -163,9 +168,13 @@ async def init_multi_tokenizer() -> ServerArgs:
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )
+    logger.info(
+        f"Start multi-tokenizer worker process {os.getpid()}, "
+        f"ipc_name={port_args.tokenizer_ipc_name}"
+    )
 
     # Launch multi-tokenizer manager process
-    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    tokenizer_manager = TokenizerWorker(server_args, port_args)
     template_manager = TemplateManager()
     template_manager.initialize_templates(
         tokenizer_manager=tokenizer_manager,
@@ -173,8 +182,6 @@ async def init_multi_tokenizer() -> ServerArgs:
         chat_template=server_args.chat_template,
         completion_template=server_args.completion_template,
     )
-    # Register this tokenizer with the main tokenizer manager
-    await tokenizer_manager.register_to_main_tokenizer_manager()
 
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
     set_global_state(
@@ -226,12 +233,21 @@ async def lifespan(fast_api_app: FastAPI):
     fast_api_app.state.openai_serving_embedding = OpenAIServingEmbedding(
         _global_state.tokenizer_manager, _global_state.template_manager
     )
+    fast_api_app.state.openai_serving_classify = OpenAIServingClassify(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
     fast_api_app.state.openai_serving_score = OpenAIServingScore(
         _global_state.tokenizer_manager
     )
     fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
         _global_state.tokenizer_manager
     )
+    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
+        _global_state.tokenizer_manager
+    )
 
     server_args: ServerArgs = fast_api_app.server_args
 
@@ -302,7 +318,23 @@ app.add_middleware(
 
 @app.exception_handler(HTTPException)
 async def validation_exception_handler(request: Request, exc: HTTPException):
-    """Enrich HTTP exception with status code and other details"""
+    """Enrich HTTP exception with status code and other details.
+
+    For /v1/responses, emit OpenAI-style nested error envelope:
+    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
+    """
+    # adjust fmt for responses api
+    if request.url.path.startswith("/v1/responses"):
+        nested_error = {
+            "message": exc.detail,
+            "type": HTTPStatus(exc.status_code).phrase,
+            "param": None,
+            "code": exc.status_code,
+        }
+        return ORJSONResponse(
+            content={"error": nested_error}, status_code=exc.status_code
+        )
+
     error = ErrorResponse(
         object="error",
         message=exc.detail,
@@ -315,7 +347,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
-    """Override FastAPI's default 422 validation error with 400"""
+    """Override FastAPI's default 422 validation error with 400.
+
+    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
+    """
     exc_str = str(exc)
     errors_str = str(exc.errors())
 
@@ -324,6 +359,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationError):
     else:
         message = exc_str
 
+    if request.url.path.startswith("/v1/responses"):
+        # adapt specially, for v1/responses API only (notice the error key is different)
+        nested_error = {
+            "message": message,
+            "type": HTTPStatus.BAD_REQUEST.phrase,
+            "param": None,
+            "code": HTTPStatus.BAD_REQUEST.value,
+        }
+        return ORJSONResponse(status_code=400, content={"error": nested_error})
+
     err = ErrorResponse(
         message=message,
         type=HTTPStatus.BAD_REQUEST.phrase,
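
Note: after the two exception-handler hunks above, errors from /v1/responses arrive as an OpenAI-style nested envelope while every other endpoint keeps the flat ErrorResponse shape. A minimal client-side sketch, assuming a locally running server on port 30000 (URL and payload are illustrative, not from this diff):

import requests

# Deliberately invalid request so the server returns an error body.
resp = requests.post("http://localhost:30000/v1/responses", json={"bad_field": 1})
body = resp.json()

# /v1/responses nests details under "error"; other endpoints return the flat form.
err = body.get("error", body)
print(resp.status_code, err.get("message"), err.get("type"), err.get("code"))
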
@@ -468,7 +513,7 @@ async def get_load():
 
 
 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -517,7 +562,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
 async def generate_from_file_request(file: UploadFile, request: Request):
     """Handle a generate request, this is purely to work with input_embeds."""
     content = await file.read()
-    input_embeds = json.loads(content.decode("utf-8"))
+    input_embeds = orjson.loads(content.decode("utf-8"))
 
     obj = GenerateReqInput(
         input_embeds=input_embeds,
@@ -596,6 +641,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
         with_stack=obj.with_stack,
         record_shapes=obj.record_shapes,
         profile_by_stage=obj.profile_by_stage,
+        merge_profiles=obj.merge_profiles,
     )
     return Response(
         content="Start profiling.\n",
@@ -731,6 +777,20 @@ async def init_weights_update_group(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
 
 
+@app.post("/destroy_weights_update_group")
+async def destroy_weights_update_group(
+    obj: DestroyWeightsUpdateGroupReqInput, request: Request
+):
+    """Destroy the parameter update group."""
+    success, message = (
+        await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
+    )
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
 @app.post("/update_weights_from_tensor")
 async def update_weights_from_tensor(
     obj: UpdateWeightsFromTensorReqInput, request: Request
@@ -780,6 +840,27 @@ async def update_weights_from_distributed(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
 
 
+@app.post("/update_weights_from_ipc")
+async def update_weights_from_ipc(obj: UpdateWeightsFromIPCReqInput, request: Request):
+    """Update the weights from IPC (Inter-Process Communication) for checkpoint-engine integration."""
+    success, message = await _global_state.tokenizer_manager.update_weights_from_ipc(
+        obj, request
+    )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
+    content = {"success": success, "message": message}
+    if success:
+        if _global_state.tokenizer_manager.initial_weights_loaded is False:
+            _global_state.tokenizer_manager.initial_weights_loaded = True
+        return ORJSONResponse(content)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/update_weight_version")
 async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
     """Update the weight version. This operation requires no active requests."""
@@ -1030,6 +1111,54 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
     )
 
 
+@app.post(
+    "/v1/classify",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+async def openai_v1_classify(request: ClassifyRequest, raw_request: Request):
+    """OpenAI-compatible classification endpoint."""
+    return await raw_request.app.state.openai_serving_classify.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
+    """OpenAI-compatible tokenization endpoint."""
+    return await raw_request.app.state.openai_serving_tokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
+    """OpenAI-compatible detokenization endpoint."""
+    return await raw_request.app.state.openai_serving_detokenize.handle_request(
+        request, raw_request
+    )
+
+
 @app.get("/v1/models", response_class=ORJSONResponse)
 async def available_models():
     """Show available models. OpenAI-compatible endpoint."""
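
Note: the hunk above wires three new OpenAI-compatible routes (/v1/classify, /v1/tokenize, /v1/detokenize, plus unversioned aliases for the last two). A rough usage sketch follows; the accepted request fields are defined by ClassifyRequest, TokenizeRequest, and DetokenizeRequest in entrypoints/openai/protocol.py, which this diff does not show, so the "model", "prompt", and "tokens" fields below are assumptions for illustration only:

import requests

BASE = "http://localhost:30000"  # assumed local server address

# Tokenize a prompt (field names are guesses; see TokenizeRequest for the real schema).
tok = requests.post(f"{BASE}/v1/tokenize", json={"model": "default", "prompt": "Hello world"})
print(tok.status_code, tok.json())

# Detokenize a token list back to text (DetokenizeRequest defines the real schema).
detok = requests.post(f"{BASE}/v1/detokenize", json={"model": "default", "tokens": [1, 2, 3]})
print(detok.status_code, detok.json())
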
@@ -1424,6 +1553,8 @@ def _wait_and_warmup(
     pipe_finish_writer: Optional[multiprocessing.connection.Connection],
     launch_callback: Optional[Callable[[], None]] = None,
 ):
+    if server_args.checkpoint_engine_wait_weights_before_ready:
+        _wait_weights_ready()
     if not server_args.skip_server_warmup:
         if not _execute_server_warmup(
             server_args,
@@ -1446,3 +1577,24 @@
 
     if launch_callback is not None:
         launch_callback()
+
+
+def _wait_weights_ready():
+    """Wait for weights to be ready within the specified timeout."""
+    timeout = WAIT_WEIGHTS_READY_TIMEOUT
+    start_time = time.time()
+
+    for _ in range(timeout):
+        if _global_state.tokenizer_manager.initial_weights_loaded:
+            logger.info(
+                f"Weights are ready after {time.time() - start_time:.2f} seconds"
+            )
+            return
+        time.sleep(1)
+
+    # Timeout reached without weights being ready
+    logger.error(
+        f"Weights are not ready after waiting {timeout} seconds. "
+        f"Consider increasing SGLANG_WAIT_WEIGHTS_READY_TIMEOUT environment variable. "
+        f"Current status: initial_weights_loaded={_global_state.tokenizer_manager.initial_weights_loaded}"
+    )
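
Note: the last two hunks, together with the new /update_weights_from_ipc route, gate readiness on an initial weight load: when checkpoint_engine_wait_weights_before_ready is set, _wait_and_warmup polls initial_weights_loaded for up to SGLANG_WAIT_WEIGHTS_READY_TIMEOUT seconds before warmup runs, and the flag is flipped by the first successful /update_weights_from_ipc call. A hedged operator-side sketch; the payload fields of UpdateWeightsFromIPCReqInput are not shown in this diff, so only weight_version (referenced in the handler above) is used here:

import requests

# Push weights from a checkpoint-engine process; any other required fields live in
# UpdateWeightsFromIPCReqInput and are omitted here because they are unknown.
resp = requests.post(
    "http://localhost:30000/update_weights_from_ipc",
    json={"weight_version": "step-100"},
)
print(resp.status_code, resp.json())  # {"success": ..., "message": ...}
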
--- a/sglang/srt/entrypoints/http_server_engine.py
+++ b/sglang/srt/entrypoints/http_server_engine.py
@@ -1,15 +1,9 @@
-import copy
-import dataclasses
 import multiprocessing
-import pickle
-import threading
 import time
-from typing import
+from typing import List, Optional, Tuple
 
-import pybase64
 import requests
 import torch
-import torch.distributed as dist
 
 from sglang.srt.entrypoints.EngineBase import EngineBase
 from sglang.srt.entrypoints.http_server import launch_server