sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py

```diff
@@ -27,9 +27,9 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union

-import
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
```
```diff
@@ -47,21 +47,19 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     CompletionRequest,
+    DetokenizeRequest,
     EmbeddingRequest,
     ErrorResponse,
     ModelCard,
     ModelList,
     ResponsesRequest,
     ScoringRequest,
+    TokenizeRequest,
     V1RerankReqInput,
 )
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -69,14 +67,20 @@ from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompl
 from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
 from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.entrypoints.openai.serving_tokenize import (
+    OpenAIServingDetokenize,
+    OpenAIServingTokenize,
+)
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -84,6 +88,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -95,9 +100,10 @@ from sglang.srt.managers.io_struct import (
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
-
-
+    MultiTokenizerRouter,
+    TokenizerWorker,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
@@ -127,7 +133,7 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
     template_manager: TemplateManager
     scheduler_info: Dict

@@ -140,21 +146,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
```
```diff
@@ -162,18 +153,22 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")

     # Read configuration from shared memory
-
-
-
-
-
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"

     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )

     # Launch multi-tokenizer manager process
-    tokenizer_manager =
+    tokenizer_manager = TokenizerWorker(server_args, port_args)
     template_manager = TemplateManager()
     template_manager.initialize_templates(
         tokenizer_manager=tokenizer_manager,
@@ -192,18 +187,29 @@ async def init_multi_tokenizer() -> ServerArgs:
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args


 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-
-
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
```
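In multi-tokenizer worker processes, the reworked lifespan hook now attaches only the Prometheus middleware (API keys are rejected earlier by the assert in init_multi_tokenizer). A minimal scrape sketch, assuming the server listens on localhost:30000 and exposes the usual Prometheus route; neither detail is part of this diff:

```python
# Sketch only: assumes metrics are enabled, default port 30000, and the
# standard /metrics exposition route; none of these appear in this diff.
import requests

text = requests.get("http://localhost:30000/metrics").text
print("\n".join(text.splitlines()[:5]))  # first few exposition lines
```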
```diff
@@ -229,6 +235,12 @@ async def lifespan(fast_api_app: FastAPI):
     fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
         _global_state.tokenizer_manager
     )
+    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
+        _global_state.tokenizer_manager
+    )

     server_args: ServerArgs = fast_api_app.server_args

@@ -299,7 +311,23 @@ app.add_middleware(

 @app.exception_handler(HTTPException)
 async def validation_exception_handler(request: Request, exc: HTTPException):
-    """Enrich HTTP exception with status code and other details"""
+    """Enrich HTTP exception with status code and other details.
+
+    For /v1/responses, emit OpenAI-style nested error envelope:
+    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
+    """
+    # adjust fmt for responses api
+    if request.url.path.startswith("/v1/responses"):
+        nested_error = {
+            "message": exc.detail,
+            "type": HTTPStatus(exc.status_code).phrase,
+            "param": None,
+            "code": exc.status_code,
+        }
+        return ORJSONResponse(
+            content={"error": nested_error}, status_code=exc.status_code
+        )
+
     error = ErrorResponse(
         object="error",
         message=exc.detail,
@@ -312,7 +340,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
-    """Override FastAPI's default 422 validation error with 400"""
+    """Override FastAPI's default 422 validation error with 400.
+
+    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
+    """
     exc_str = str(exc)
     errors_str = str(exc.errors())

```
```diff
@@ -321,6 +352,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
     else:
         message = exc_str

+    if request.url.path.startswith("/v1/responses"):
+        # adapt specially, for v1/responses API only (notice the error key is different)
+        nested_error = {
+            "message": message,
+            "type": HTTPStatus.BAD_REQUEST.phrase,
+            "param": None,
+            "code": HTTPStatus.BAD_REQUEST.value,
+        }
+        return ORJSONResponse(status_code=400, content={"error": nested_error})
+
     err = ErrorResponse(
         message=message,
         type=HTTPStatus.BAD_REQUEST.phrase,
```
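With both handlers updated, validation and HTTP errors on the Responses API now return a nested OpenAI-style envelope instead of the flat ErrorResponse used by other routes. A client-side sketch, assuming a server on localhost:30000 and the requests package; the deliberately malformed payload is only an illustration:

```python
# Sketch only: the body below is assumed to fail ResponsesRequest validation.
import requests

resp = requests.post(
    "http://localhost:30000/v1/responses",
    json={"input": 12345},  # placeholder invalid payload
)
print(resp.status_code)  # 400 instead of FastAPI's default 422
print(resp.json())
# expected shape per the handler above:
# {"error": {"message": "...", "type": "Bad Request", "param": None, "code": 400}}
```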
```diff
@@ -465,7 +506,7 @@ async def get_load():


 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
```
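The refreshed comment documents a complete set_internal_state payload using the pp_max_micro_batch_size field. The same call from Python, as a sketch with the server address assumed:

```python
# Python equivalent of the curl example in the hunk above.
import requests

resp = requests.post(
    "http://localhost:30000/set_internal_state",
    json={"server_args": {"pp_max_micro_batch_size": 8}},
)
print(resp.json())
```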
```diff
@@ -681,6 +722,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )


+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
```
```diff
@@ -696,6 +769,20 @@ async def init_weights_update_group(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)


+@app.post("/destroy_weights_update_group")
+async def destroy_weights_update_group(
+    obj: DestroyWeightsUpdateGroupReqInput, request: Request
+):
+    """Destroy the parameter update group."""
+    success, message = (
+        await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
+    )
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
 @app.post("/update_weights_from_tensor")
 async def update_weights_from_tensor(
     obj: UpdateWeightsFromTensorReqInput, request: Request
```
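The three new weight-management routes above (/init_weights_send_group_for_remote_instance, /send_weights_to_remote_instance, /destroy_weights_update_group) all answer with a {"success", "message"} body and a 200 or 400 status. A hedged client sketch; the request fields live in the *ReqInput dataclasses in sglang/srt/managers/io_struct.py and are not shown here, so the empty JSON body is a placeholder:

```python
# Illustrative only: the required payload fields depend on
# DestroyWeightsUpdateGroupReqInput, which this diff does not show.
import requests

resp = requests.post("http://localhost:30000/destroy_weights_update_group", json={})
print(resp.status_code, resp.json())  # {"success": ..., "message": ...} when handled
```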
```diff
@@ -995,6 +1082,42 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
     )


+@app.post(
+    "/v1/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
+    """OpenAI-compatible tokenization endpoint."""
+    return await raw_request.app.state.openai_serving_tokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
+    """OpenAI-compatible detokenization endpoint."""
+    return await raw_request.app.state.openai_serving_detokenize.handle_request(
+        request, raw_request
+    )
+
+
 @app.get("/v1/models", response_class=ORJSONResponse)
 async def available_models():
     """Show available models. OpenAI-compatible endpoint."""
```
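Each tokenization route is registered twice, with and without the /v1 prefix. A rough client sketch; the TokenizeRequest and DetokenizeRequest schemas are defined in sglang/srt/entrypoints/openai/protocol.py and are not part of this diff, so the model, prompt, and tokens fields below are assumptions for illustration only:

```python
# Field names below are guesses for illustration, not taken from this diff.
import requests

BASE = "http://localhost:30000"

tok = requests.post(f"{BASE}/v1/tokenize", json={"model": "default", "prompt": "Hello world"})
print(tok.status_code, tok.json())

detok = requests.post(f"{BASE}/v1/detokenize", json={"model": "default", "tokens": [1, 2, 3]})
print(detok.status_code, detok.json())
```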
```diff
@@ -1168,7 +1291,6 @@ def launch_server(
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     if server_args.tokenizer_worker_num > 1:
-        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
         port_args = PortArgs.init_new(server_args)
         port_args.tokenizer_worker_ipc_name = (
             f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -1177,11 +1299,16 @@ def launch_server(
             server_args=server_args, port_args=port_args
         )
     else:
-        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
         )

+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
```
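launch_server now initializes tracing when enable_trace is set, labeling the main thread "Tokenizer" outside of disaggregation. A programmatic launch sketch; the enable_trace and oltp_traces_endpoint field names come from the hunk above, while the model path, the collector address, and the exact ServerArgs/launch_server signatures are assumptions:

```python
# Sketch under stated assumptions; not the project's documented launch recipe.
from sglang.srt.entrypoints.http_server import launch_server
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(
    model_path="/path/to/model",                   # placeholder
    enable_trace=True,                             # field read in launch_server above
    oltp_traces_endpoint="http://localhost:4317",  # assumed OTLP collector address
)
launch_server(server_args)
```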
```diff
@@ -1191,12 +1318,10 @@ def launch_server(
         )

     if server_args.tokenizer_worker_num > 1:
-
-
-
-
-                scheduler_info,
-            )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1233,6 +1358,9 @@ def launch_server(
             "level": "INFO",
             "propagate": False,
         }
+
+        monkey_patch_uvicorn_multiprocessing()
+
         uvicorn.run(
             "sglang.srt.entrypoints.http_server:app",
             host=server_args.host,
```
```diff
@@ -1243,6 +1371,7 @@ def launch_server(
             workers=server_args.tokenizer_worker_num,
         )
     else:
+        app.is_single_tokenizer_mode = True
         uvicorn.run(
             app,
             host=server_args.host,
@@ -1253,10 +1382,8 @@ def launch_server(
         )
     finally:
         if server_args.tokenizer_worker_num > 1:
-
-
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
             warmup_thread.join()

```
```diff
@@ -1405,13 +1532,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
```