sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multi_tokenizer_mixin.py

@@ -11,32 +11,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""
+"""Mixin class and utils for multi-http-worker mode"""
 import asyncio
-import dataclasses
-import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import pickle
 import sys
 import threading
+from functools import partialmethod
 from multiprocessing import shared_memory
-from typing import Dict
+from typing import Any, Dict

 import setproctitle
 import zmq
 import zmq.asyncio

 from sglang.srt.disaggregation.utils import DisaggregationMode, TransferBackend
+from sglang.srt.managers.disagg_service import start_disagg_service
 from sglang.srt.managers.io_struct import (
-
-
-
-
+    BatchEmbeddingOutput,
+    BatchMultimodalOutput,
+    BatchStrOutput,
+    BatchTokenIDOutput,
     MultiTokenizerRegisterReq,
-
+    MultiTokenizerWrapper,
 )
-from sglang.srt.managers.
+from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import get_zmq_socket, kill_process_tree
 from sglang.utils import get_exception_traceback
@@ -44,302 +46,304 @@ from sglang.utils import get_exception_traceback
 logger = logging.getLogger(__name__)


-class
-
+class SocketMapping:
+    def __init__(self):
+        self._zmq_context = zmq.Context()
+        self._mapping: Dict[str, zmq.Socket] = {}

-    def
-
-
-
-        if not hasattr(self, "_zmq_context"):
-            self._zmq_context = zmq.Context()
+    def clear_all_sockets(self):
+        for socket in self._mapping.values():
+            socket.close()
+        self._mapping.clear()

-    def
-        self, recv_obj: MultiTokenizerRegisterReq, worker_id: str
+    def register_ipc_mapping(
+        self, recv_obj: MultiTokenizerRegisterReq, worker_id: str, is_tokenizer: bool
     ):
-        ""
-
-
-
-        if worker_id_int not in self.tokenizer_mapping:
-            socket = get_zmq_socket(self._zmq_context, zmq.PUSH, ipc_name, False)
-            self.tokenizer_mapping[worker_id_int] = socket
-            self.tokenizer_mapping[worker_id_int].send_pyobj(recv_obj)
-            return True
-        else:
-            return False
-
-    def register_tokenizer_ipc(self, recv_obj, worker_id):
-        if worker_id not in self.tokenizer_mapping:
-            # register the worker if not already done
-            if isinstance(recv_obj, MultiTokenizerRegisterReq):
-                return self.init_tokenizer_mapping(recv_obj, worker_id)
-            else:
-                logger.error(
-                    f"Worker {worker_id} not registered and not found in tokenizer mapping . "
-                    "Please ensure the worker is registered correctly."
-                )
-                return False
-
-    def _handle_output_by_index(self, output, i):
-        """NOTE: A maintainable method is better here."""
-        if isinstance(output, BatchTokenIDOut):
-            new_output = BatchTokenIDOut(
-                rids=[output.rids[i]],
-                finished_reasons=(
-                    [output.finished_reasons[i]]
-                    if len(output.finished_reasons) > i
-                    else None
-                ),
-                decoded_texts=(
-                    [output.decoded_texts[i]] if len(output.decoded_texts) > i else None
-                ),
-                decode_ids=(
-                    [output.decode_ids[i]] if len(output.decode_ids) > i else None
-                ),
-                read_offsets=(
-                    [output.read_offsets[i]] if len(output.read_offsets) > i else None
-                ),
-                output_ids=(
-                    [output.output_ids[i]]
-                    if output.output_ids and len(output.output_ids) > i
-                    else None
-                ),
-                skip_special_tokens=(
-                    [output.skip_special_tokens[i]]
-                    if len(output.skip_special_tokens) > i
-                    else None
-                ),
-                spaces_between_special_tokens=(
-                    [output.spaces_between_special_tokens[i]]
-                    if len(output.spaces_between_special_tokens) > i
-                    else None
-                ),
-                no_stop_trim=(
-                    [output.no_stop_trim[i]] if len(output.no_stop_trim) > i else None
-                ),
-                prompt_tokens=(
-                    [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
-                ),
-                completion_tokens=(
-                    [output.completion_tokens[i]]
-                    if len(output.completion_tokens) > i
-                    else None
-                ),
-                cached_tokens=(
-                    [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
-                ),
-                spec_verify_ct=(
-                    [output.spec_verify_ct[i]]
-                    if len(output.spec_verify_ct) > i
-                    else None
-                ),
-                input_token_logprobs_val=(
-                    [output.input_token_logprobs_val[i]]
-                    if output.input_token_logprobs_val
-                    else None
-                ),
-                input_token_logprobs_idx=(
-                    [output.input_token_logprobs_idx[i]]
-                    if output.input_token_logprobs_idx
-                    else None
-                ),
-                output_token_logprobs_val=(
-                    [output.output_token_logprobs_val[i]]
-                    if output.output_token_logprobs_val
-                    else None
-                ),
-                output_token_logprobs_idx=(
-                    [output.output_token_logprobs_idx[i]]
-                    if output.output_token_logprobs_idx
-                    else None
-                ),
-                input_top_logprobs_val=(
-                    [output.input_top_logprobs_val[i]]
-                    if output.input_top_logprobs_val
-                    else None
-                ),
-                input_top_logprobs_idx=(
-                    [output.input_top_logprobs_idx[i]]
-                    if output.input_top_logprobs_idx
-                    else None
-                ),
-                output_top_logprobs_val=(
-                    [output.output_top_logprobs_val[i]]
-                    if output.output_top_logprobs_val
-                    else None
-                ),
-                output_top_logprobs_idx=(
-                    [output.output_top_logprobs_idx[i]]
-                    if output.output_top_logprobs_idx
-                    else None
-                ),
-                input_token_ids_logprobs_val=(
-                    [output.input_token_ids_logprobs_val[i]]
-                    if output.input_token_ids_logprobs_val
-                    else None
-                ),
-                input_token_ids_logprobs_idx=(
-                    [output.input_token_ids_logprobs_idx[i]]
-                    if output.input_token_ids_logprobs_idx
-                    else None
-                ),
-                output_token_ids_logprobs_val=(
-                    [output.output_token_ids_logprobs_val[i]]
-                    if output.output_token_ids_logprobs_val
-                    else None
-                ),
-                output_token_ids_logprobs_idx=(
-                    [output.output_token_ids_logprobs_idx[i]]
-                    if output.output_token_ids_logprobs_idx
-                    else None
-                ),
-                output_hidden_states=(
-                    [output.output_hidden_states[i]]
-                    if output.output_hidden_states
-                    else None
-                ),
-            )
-        elif isinstance(output, BatchEmbeddingOut):
-            new_output = BatchEmbeddingOut(
-                rids=[output.rids[i]],
-                finished_reasons=(
-                    [output.finished_reasons[i]]
-                    if len(output.finished_reasons) > i
-                    else None
-                ),
-                embeddings=(
-                    [output.embeddings[i]] if len(output.embeddings) > i else None
-                ),
-                prompt_tokens=(
-                    [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
-                ),
-                cached_tokens=(
-                    [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
-                ),
+        type_str = "tokenizer" if is_tokenizer else "detokenizer"
+        if worker_id in self._mapping:
+            logger.warning(
+                f"{type_str} already registered with worker {worker_id}, skipping..."
             )
-
-
-
-
-
-
-
-
-
-
-
-
-                    [output.output_ids[i]]
-                    if output.output_ids and len(output.output_ids) > i
-                    else None
-                ),
-                prompt_tokens=(
-                    [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
-                ),
-                completion_tokens=(
-                    [output.completion_tokens[i]]
-                    if len(output.completion_tokens) > i
-                    else None
-                ),
-                cached_tokens=(
-                    [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
-                ),
-                spec_verify_ct=(
-                    [output.spec_verify_ct[i]]
-                    if len(output.spec_verify_ct) > i
-                    else None
-                ),
-                input_token_logprobs_val=(
-                    [output.input_token_logprobs_val[i]]
-                    if output.input_token_logprobs_val
-                    else None
-                ),
-                input_token_logprobs_idx=(
-                    [output.input_token_logprobs_idx[i]]
-                    if output.input_token_logprobs_idx
-                    else None
-                ),
-                output_token_logprobs_val=(
-                    [output.output_token_logprobs_val[i]]
-                    if output.output_token_logprobs_val
-                    else None
-                ),
-                output_token_logprobs_idx=(
-                    [output.output_token_logprobs_idx[i]]
-                    if output.output_token_logprobs_idx
-                    else None
-                ),
-                input_top_logprobs_val=(
-                    [output.input_top_logprobs_val[i]]
-                    if output.input_top_logprobs_val
-                    else None
-                ),
-                input_top_logprobs_idx=(
-                    [output.input_top_logprobs_idx[i]]
-                    if output.input_top_logprobs_idx
-                    else None
-                ),
-                output_top_logprobs_val=(
-                    [output.output_top_logprobs_val[i]]
-                    if output.output_top_logprobs_val
-                    else None
-                ),
-                output_top_logprobs_idx=(
-                    [output.output_top_logprobs_idx[i]]
-                    if output.output_top_logprobs_idx
-                    else None
-                ),
-                input_token_ids_logprobs_val=(
-                    [output.input_token_ids_logprobs_val[i]]
-                    if output.input_token_ids_logprobs_val
-                    else None
-                ),
-                input_token_ids_logprobs_idx=(
-                    [output.input_token_ids_logprobs_idx[i]]
-                    if output.input_token_ids_logprobs_idx
-                    else None
-                ),
-                output_token_ids_logprobs_val=(
-                    [output.output_token_ids_logprobs_val[i]]
-                    if output.output_token_ids_logprobs_val
-                    else None
-                ),
-                output_token_ids_logprobs_idx=(
-                    [output.output_token_ids_logprobs_idx[i]]
-                    if output.output_token_ids_logprobs_idx
-                    else None
-                ),
-                output_hidden_states=(
-                    [output.output_hidden_states[i]]
-                    if output.output_hidden_states
-                    else None
-                ),
-            )
-        elif isinstance(output, BatchMultimodalOut):
-            new_output = BatchMultimodalOut(
-                rids=[output.rids[i]],
-                finished_reasons=(
-                    [output.finished_reasons[i]]
-                    if len(output.finished_reasons) > i
-                    else None
-                ),
-                outputs=([output.outputs[i]] if len(output.outputs) > i else None),
-                prompt_tokens=(
-                    [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
-                ),
-                completion_tokens=(
-                    [output.completion_tokens[i]]
-                    if len(output.completion_tokens) > i
-                    else None
-                ),
-                cached_tokens=(
-                    [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
-                ),
+            return
+        logger.info(
+            f"{type_str} not registered with worker {worker_id}, registering..."
+        )
+        socket = get_zmq_socket(self._zmq_context, zmq.PUSH, recv_obj.ipc_name, False)
+        self._mapping[worker_id] = socket
+        self._mapping[worker_id].send_pyobj(recv_obj)
+
+    def send_output(self, worker_id: str, output: Any):
+        if worker_id not in self._mapping:
+            logger.error(
+                f"worker ID {worker_id} not registered. Check if the server Process is alive"
             )
-
-
-
+            return
+        self._mapping[worker_id].send_pyobj(output)
+
+
+def _handle_output_by_index(output, i):
+    """NOTE: A maintainable method is better here."""
+    if isinstance(output, BatchTokenIDOutput):
+        new_output = BatchTokenIDOutput(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            decoded_texts=(
+                [output.decoded_texts[i]] if len(output.decoded_texts) > i else None
+            ),
+            decode_ids=([output.decode_ids[i]] if len(output.decode_ids) > i else None),
+            read_offsets=(
+                [output.read_offsets[i]] if len(output.read_offsets) > i else None
+            ),
+            output_ids=(
+                [output.output_ids[i]]
+                if output.output_ids and len(output.output_ids) > i
+                else None
+            ),
+            skip_special_tokens=(
+                [output.skip_special_tokens[i]]
+                if len(output.skip_special_tokens) > i
+                else None
+            ),
+            spaces_between_special_tokens=(
+                [output.spaces_between_special_tokens[i]]
+                if len(output.spaces_between_special_tokens) > i
+                else None
+            ),
+            no_stop_trim=(
+                [output.no_stop_trim[i]] if len(output.no_stop_trim) > i else None
+            ),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            completion_tokens=(
+                [output.completion_tokens[i]]
+                if len(output.completion_tokens) > i
+                else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            spec_verify_ct=(
+                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
+            ),
+            input_token_logprobs_val=(
+                [output.input_token_logprobs_val[i]]
+                if output.input_token_logprobs_val
+                else None
+            ),
+            input_token_logprobs_idx=(
+                [output.input_token_logprobs_idx[i]]
+                if output.input_token_logprobs_idx
+                else None
+            ),
+            output_token_logprobs_val=(
+                [output.output_token_logprobs_val[i]]
+                if output.output_token_logprobs_val
+                else None
+            ),
+            output_token_logprobs_idx=(
+                [output.output_token_logprobs_idx[i]]
+                if output.output_token_logprobs_idx
+                else None
+            ),
+            input_top_logprobs_val=(
+                [output.input_top_logprobs_val[i]]
+                if output.input_top_logprobs_val
+                else None
+            ),
+            input_top_logprobs_idx=(
+                [output.input_top_logprobs_idx[i]]
+                if output.input_top_logprobs_idx
+                else None
+            ),
+            output_top_logprobs_val=(
+                [output.output_top_logprobs_val[i]]
+                if output.output_top_logprobs_val
+                else None
+            ),
+            output_top_logprobs_idx=(
+                [output.output_top_logprobs_idx[i]]
+                if output.output_top_logprobs_idx
+                else None
+            ),
+            input_token_ids_logprobs_val=(
+                [output.input_token_ids_logprobs_val[i]]
+                if output.input_token_ids_logprobs_val
+                else None
+            ),
+            input_token_ids_logprobs_idx=(
+                [output.input_token_ids_logprobs_idx[i]]
+                if output.input_token_ids_logprobs_idx
+                else None
+            ),
+            output_token_ids_logprobs_val=(
+                [output.output_token_ids_logprobs_val[i]]
+                if output.output_token_ids_logprobs_val
+                else None
+            ),
+            output_token_ids_logprobs_idx=(
+                [output.output_token_ids_logprobs_idx[i]]
+                if output.output_token_ids_logprobs_idx
+                else None
+            ),
+            output_hidden_states=(
+                [output.output_hidden_states[i]]
+                if output.output_hidden_states
+                else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchEmbeddingOutput):
+        new_output = BatchEmbeddingOutput(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            embeddings=([output.embeddings[i]] if len(output.embeddings) > i else None),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchStrOutput):
+        new_output = BatchStrOutput(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            output_strs=(
+                [output.output_strs[i]] if len(output.output_strs) > i else None
+            ),
+            output_ids=(
+                [output.output_ids[i]]
+                if output.output_ids and len(output.output_ids) > i
+                else None
+            ),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            completion_tokens=(
+                [output.completion_tokens[i]]
+                if len(output.completion_tokens) > i
+                else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            spec_verify_ct=(
+                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
+            ),
+            input_token_logprobs_val=(
+                [output.input_token_logprobs_val[i]]
+                if output.input_token_logprobs_val
+                else None
+            ),
+            input_token_logprobs_idx=(
+                [output.input_token_logprobs_idx[i]]
+                if output.input_token_logprobs_idx
+                else None
+            ),
+            output_token_logprobs_val=(
+                [output.output_token_logprobs_val[i]]
+                if output.output_token_logprobs_val
+                else None
+            ),
+            output_token_logprobs_idx=(
+                [output.output_token_logprobs_idx[i]]
+                if output.output_token_logprobs_idx
+                else None
+            ),
+            input_top_logprobs_val=(
+                [output.input_top_logprobs_val[i]]
+                if output.input_top_logprobs_val
+                else None
+            ),
+            input_top_logprobs_idx=(
+                [output.input_top_logprobs_idx[i]]
+                if output.input_top_logprobs_idx
+                else None
+            ),
+            output_top_logprobs_val=(
+                [output.output_top_logprobs_val[i]]
+                if output.output_top_logprobs_val
+                else None
+            ),
+            output_top_logprobs_idx=(
+                [output.output_top_logprobs_idx[i]]
+                if output.output_top_logprobs_idx
+                else None
+            ),
+            input_token_ids_logprobs_val=(
+                [output.input_token_ids_logprobs_val[i]]
+                if output.input_token_ids_logprobs_val
+                else None
+            ),
+            input_token_ids_logprobs_idx=(
+                [output.input_token_ids_logprobs_idx[i]]
+                if output.input_token_ids_logprobs_idx
+                else None
+            ),
+            output_token_ids_logprobs_val=(
+                [output.output_token_ids_logprobs_val[i]]
+                if output.output_token_ids_logprobs_val
+                else None
+            ),
+            output_token_ids_logprobs_idx=(
+                [output.output_token_ids_logprobs_idx[i]]
+                if output.output_token_ids_logprobs_idx
+                else None
+            ),
+            output_hidden_states=(
+                [output.output_hidden_states[i]]
+                if output.output_hidden_states
+                else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchMultimodalOutput):
+        new_output = BatchMultimodalOutput(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            outputs=([output.outputs[i]] if len(output.outputs) > i else None),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            completion_tokens=(
+                [output.completion_tokens[i]]
+                if len(output.completion_tokens) > i
+                else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    else:
+        new_output = output
+    return new_output
+
+
+class MultiHttpWorkerDetokenizerMixin:
+    """Mixin class for DetokenizerManager"""

     def get_worker_ids_from_req_rids(self, rids):
         if isinstance(rids, list):
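The module-level _handle_output_by_index added above slices one request out of a batched scheduler output so it can be forwarded to the HTTP worker that owns it. A minimal sketch of that per-index slicing pattern, using a hypothetical two-field dataclass rather than sglang's real BatchStrOutput:

from dataclasses import dataclass
from typing import List

@dataclass
class MiniBatchOutput:  # hypothetical stand-in for BatchStrOutput
    rids: List[str]
    output_strs: List[str]

def slice_output(batch: MiniBatchOutput, i: int) -> MiniBatchOutput:
    # Keep only the i-th request. The real helper also guards every field with
    # length checks and falls back to None; aligned lists are assumed here.
    return MiniBatchOutput(rids=[batch.rids[i]], output_strs=[batch.output_strs[i]])

batch = MiniBatchOutput(rids=["7_abc", "3_def"], output_strs=["hi", "yo"])
print(slice_output(batch, 1))  # MiniBatchOutput(rids=['3_def'], output_strs=['yo'])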
@@ -350,9 +354,13 @@ class MultiTokenizerMixin:
             worker_ids = []
         return worker_ids

-    def
-
-
+    def maybe_clear_socket_mapping(self):
+        if hasattr(self, "socket_mapping"):
+            self.socket_mapping.clear_all_sockets()
+
+    def multi_http_worker_event_loop(self):
+        """The event loop that handles requests, for multi multi-http-worker mode"""
+        self.socket_mapping = SocketMapping()
         while True:
             recv_obj = self.recv_from_scheduler.recv_pyobj()
             output = self._request_dispatcher(recv_obj)
@@ -369,32 +377,16 @@ class MultiTokenizerMixin:
             # Send data using the corresponding socket
             for i, worker_id in enumerate(worker_ids):
                 if isinstance(recv_obj, MultiTokenizerRegisterReq):
-
-
-
-                    )
-                    continue
+                    self.socket_mapping.register_ipc_mapping(
+                        recv_obj, worker_id, is_tokenizer=False
+                    )
                 else:
-
-
-
-
-
-
-                    self.tokenizer_mapping[worker_id].send_pyobj(new_output)
-
-    def clear_tokenizer_mapping(self):
-        if hasattr(self, "tokenizer_mapping"):
-            for socket in self.tokenizer_mapping.values():
-                try:
-                    socket.close()
-                except Exception as e:
-                    logger.warning(f"Failed to close socket: {e}")
-            self.tokenizer_mapping.clear()
-
-
-class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
-    """A router to receive requests from MultiTokenizerManager"""
+                    new_output = _handle_output_by_index(output, i)
+                    self.socket_mapping.send_output(worker_id, new_output)
+
+
+class MultiTokenizerRouter:
+    """A router to receive requests from TokenizerWorker"""

     def __init__(
         self,
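In the rewritten event loops above, SocketMapping keeps one ZMQ PUSH socket per HTTP worker (keyed by the worker id that get_worker_ids_from_req_rids recovers from each request id) and reuses it for every later send. A standalone sketch of the same register-then-dispatch pattern with plain pyzmq, using hypothetical names rather than sglang's API:

import zmq

class WorkerSockets:
    # Simplified analogue of SocketMapping: worker_id -> PUSH socket.
    def __init__(self):
        self._ctx = zmq.Context()
        self._mapping = {}

    def register(self, worker_id: str, ipc_name: str):
        if worker_id in self._mapping:
            return  # already registered, keep the existing socket
        sock = self._ctx.socket(zmq.PUSH)
        sock.connect(ipc_name)
        self._mapping[worker_id] = sock

    def send(self, worker_id: str, obj):
        if worker_id in self._mapping:
            self._mapping[worker_id].send_pyobj(obj)

sockets = WorkerSockets()
sockets.register("7", "ipc:///tmp/sglang_worker_7")
sockets.send("7", {"rid": "7_abc", "text": "hello"})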
@@ -422,7 +414,7 @@ class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
         self._handle_task = asyncio.run_coroutine_threadsafe(
             print_exception_wrapper(self.handle_loop), self._loop
         )
-        self.
+        self.disaggregation_bootstrap_server = start_disagg_service(self.server_args)

     def _run_loop(self):
         self._loop.run_forever()
@@ -434,14 +426,14 @@ class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):

     async def handle_loop(self):
         # special reqs will recv from scheduler, need to route to right worker
-        self.
+        self.socket_mapping = SocketMapping()
         while True:
             recv_obj = await self.recv_from_detokenizer.recv_pyobj()
             await self._distribute_result_to_workers(recv_obj)

     async def _distribute_result_to_workers(self, recv_obj):
         """Distribute result to corresponding workers based on rid"""
-        if isinstance(recv_obj,
+        if isinstance(recv_obj, MultiTokenizerWrapper):
             worker_ids = [recv_obj.worker_id]
             recv_obj = recv_obj.obj
         else:
@@ -454,32 +446,23 @@ class MultiTokenizerRouter(TokenizerManager, MultiTokenizerMixin):
         # Distribute result to each worker
         for i, worker_id in enumerate(worker_ids):
             if isinstance(recv_obj, MultiTokenizerRegisterReq):
-
-
-
-                )
-                continue
+                self.socket_mapping.register_ipc_mapping(
+                    recv_obj, worker_id, is_tokenizer=True
+                )
             else:
-
-
-                    f"Tokenizer Worker ID {worker_id} not registered. Check if the server Process {worker_id} is alive"
-                )
-                continue
-                new_recv_obj = self._handle_output_by_index(recv_obj, i)
-                self.tokenizer_mapping[worker_id].send_pyobj(new_recv_obj)
+                new_recv_obj = _handle_output_by_index(recv_obj, i)
+                self.socket_mapping.send_output(worker_id, new_recv_obj)


-class
-    """
+class TokenizerWorker(TokenizerManager):
+    """Tokenizer Worker in multi-http-worker mode"""

     def __init__(
         self,
         server_args: ServerArgs,
         port_args: PortArgs,
     ):
-        setproctitle.setproctitle(
-            f"sglang::http_server/multi_tokenizer_manager:{os.getpid()}"
-        )
+        setproctitle.setproctitle(f"sglang::tokenizer_worker:{os.getpid()}")
         # prevent init prefill bootstrapserver again
         disaggregation_mode = server_args.disaggregation_mode
         server_args.disaggregation_mode = "null"
@@ -535,42 +518,14 @@ async def print_exception_wrapper(func):
         sys.exit(1)


-def
-    """
-    return
-        "tokenizer_ipc_name": port_args.tokenizer_ipc_name,
-        "scheduler_input_ipc_name": port_args.scheduler_input_ipc_name,
-        "detokenizer_ipc_name": port_args.detokenizer_ipc_name,
-        "nccl_port": port_args.nccl_port,
-        "rpc_ipc_name": port_args.rpc_ipc_name,
-        "metrics_ipc_name": port_args.metrics_ipc_name,
-        "tokenizer_worker_ipc_name": port_args.tokenizer_worker_ipc_name,
-    }
-
-
-def deserialize_data(port_args: dict, server_args: dict):
-    """Deserialize data from shared dictionaries"""
-    return PortArgs(**port_args), ServerArgs(**server_args)
-
-
-def serialize_server_args(server_args: ServerArgs) -> dict:
-    """Serialize ServerArgs into a shareable dictionary"""
-    return dataclasses.asdict(server_args)
-
-
-def serialize_scheduler_info(scheduler_info: Dict) -> dict:
-    """Serialize scheduler_info into a shareable dictionary"""
-    return scheduler_info
-
-
-def deserialize_scheduler_info(data: dict) -> Dict:
-    """Deserialize scheduler_info from a shared dictionary"""
-    return data
+def get_main_process_id() -> int:
+    """Get the main process ID"""
+    return multiprocessing.current_process()._parent_pid


-def write_to_shared_memory(
+def write_to_shared_memory(obj, name: str) -> shared_memory.SharedMemory:
     """Write data to shared memory"""
-    serialized =
+    serialized = pickle.dumps(obj)
     size = len(serialized)
     try:
         # Try to open existing shared memory
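write_to_shared_memory and read_from_shared_memory now pickle whole Python objects into a named SharedMemory segment instead of passing JSON dictionaries around. A self-contained round trip in the same spirit (simplified: it always creates a fresh segment, unlike the try/except reuse path in the real helper):

import pickle
from multiprocessing import shared_memory

def write_obj(obj, name: str) -> shared_memory.SharedMemory:
    payload = pickle.dumps(obj)
    shm = shared_memory.SharedMemory(create=True, size=len(payload), name=name)
    shm.buf[: len(payload)] = payload  # segment may be page-rounded; extra bytes stay zero
    return shm

def read_obj(name: str):
    shm = shared_memory.SharedMemory(name=name)
    obj = pickle.loads(bytes(shm.buf))  # pickle ignores trailing zero padding
    shm.close()
    return obj

shm = write_obj(("port_args", {"nccl_port": 12345}), "demo_multi_tokenizer_args")
print(read_obj("demo_multi_tokenizer_args"))
shm.close()
shm.unlink()  # release the segment when done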
@@ -588,22 +543,17 @@ def write_to_shared_memory(data: dict, name: str) -> shared_memory.SharedMemory:
     return shm


-def read_from_shared_memory(name: str) ->
+def read_from_shared_memory(name: str) -> Any:
     """Read data from shared memory"""
     try:
         shm = shared_memory.SharedMemory(name=name)
-        data =
+        data = pickle.loads(bytes(shm.buf))
         shm.close()
         return data
     except FileNotFoundError:
         raise FileNotFoundError(f"Shared memory {name} not found")


-def get_main_process_id() -> int:
-    """Get the main process ID"""
-    return multiprocessing.current_process()._parent_pid
-
-
 def write_data_for_multi_tokenizer(
     port_args: PortArgs, server_args: ServerArgs, scheduler_info: Dict
 ):
@@ -612,22 +562,22 @@ def write_data_for_multi_tokenizer(
     main_pid = get_main_process_id()
     current_pid = os.getpid()
     logger.info(f"main process ID: {main_pid}, current process ID: {current_pid}")
+    args = (port_args, server_args, scheduler_info)
+    args_shm = write_to_shared_memory(args, f"multi_tokenizer_args_{current_pid}")
+    args_shm.close()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    scheduler_info_shm.close()
-
-    return port_args_shm, server_args_shm, scheduler_info_shm
+    return args_shm
+
+
+def monkey_patch_uvicorn_multiprocessing(timeout: float = 10):
+    """Monkey patch uvicorn multiprocessing is_alive timeout"""
+    # from default 5s -> 10s
+    try:
+        from uvicorn.supervisors.multiprocess import Process
+
+        Process.is_alive = partialmethod(Process.is_alive, timeout=timeout)
+
+    except ImportError:
+        logger.warning(
+            "uvicorn.supervisors.multiprocess not found, skipping monkey patch"
+        )
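The new monkey_patch_uvicorn_multiprocessing helper relies on functools.partialmethod to rebind a method with a larger default timeout. The same rebinding trick on a toy class (a hypothetical Probe standing in for uvicorn's Process supervisor):

from functools import partialmethod

class Probe:
    def is_alive(self, timeout: float = 5.0) -> float:
        # Pretend to wait up to `timeout` seconds for a liveness signal.
        return timeout

print(Probe().is_alive())  # 5.0 -- original default

# Rebind the method so every future call gets the longer timeout by default.
Probe.is_alive = partialmethod(Probe.is_alive, timeout=10.0)
print(Probe().is_alive())  # 10.0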