sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py

@@ -18,6 +18,7 @@ processes (TokenizerManager, DetokenizerManager, Scheduler).
 
 import copy
 import uuid
+from abc import ABC
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
@@ -35,6 +36,32 @@ else:
     Image = Any
 
 
+@dataclass
+class BaseReq(ABC):
+    rid: Optional[Union[str, List[str]]] = field(default=None, kw_only=True)
+    http_worker_ipc: Optional[str] = field(default=None, kw_only=True)
+
+    def regenerate_rid(self):
+        """Generate a new request ID and return it."""
+        if isinstance(self.rid, list):
+            self.rid = [uuid.uuid4().hex for _ in range(len(self.rid))]
+        else:
+            self.rid = uuid.uuid4().hex
+        return self.rid
+
+
+@dataclass
+class BaseBatchReq(ABC):
+    rids: Optional[List[str]] = field(default=None, kw_only=True)
+    http_worker_ipcs: Optional[List[str]] = field(default=None, kw_only=True)
+
+    def regenerate_rids(self):
+        """Generate new request IDs and return them."""
+        self.rids = [uuid.uuid4().hex for _ in range(len(self.rids))]
+        return self.rids
+
+
+# Parameters for a session
 @dataclass
 class SessionParams:
     id: Optional[str] = None
@@ -62,7 +89,7 @@ MultimodalDataInputFormat = Union[
 
 
 @dataclass
-class GenerateReqInput:
+class GenerateReqInput(BaseReq):
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[str], str]] = None
     # The token ids for text; one can specify either text or input_ids
@@ -82,8 +109,6 @@ class GenerateReqInput:
     audio_data: Optional[MultimodalDataInputFormat] = None
     # The sampling_params. See descriptions below.
     sampling_params: Optional[Union[List[Dict], Dict]] = None
-    # The request id.
-    rid: Optional[Union[List[str], str]] = None
     # Whether to return logprobs.
     return_logprob: Optional[Union[List[bool], bool]] = None
     # If return logprobs, the start location in the prompt for returning logprobs.
@@ -132,17 +157,23 @@ class GenerateReqInput:
     # Conversation id used for tracking requests
     conversation_id: Optional[str] = None
 
-    # Label for the request
-    label: Optional[str] = None
-
     # Priority for the request
     priority: Optional[int] = None
 
-    # Image gen grpc migration
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
+
+    # (Internal) Whether to return bytes for image generation
     return_bytes: bool = False
 
-    # For customer metric labels
-    customer_labels: Optional[Dict[str, str]] = None
+    # Whether to return entropy
+    return_entropy: bool = False
 
     def contains_mm_input(self) -> bool:
         return (
@@ -485,11 +516,6 @@ class GenerateReqInput:
             ):
                 raise ValueError("Session params must be a dict or a list of dicts.")
 
-    def regenerate_rid(self):
-        """Generate a new request ID and return it."""
-        self.rid = uuid.uuid4().hex
-        return self.rid
-
     def __getitem__(self, i):
         return GenerateReqInput(
             text=self.text[i] if self.text is not None else None,
@@ -542,16 +568,17 @@ class GenerateReqInput:
                 self.data_parallel_rank if self.data_parallel_rank is not None else None
             ),
             conversation_id=self.conversation_id,
-            label=self.label,
             priority=self.priority,
+            extra_key=self.extra_key,
+            no_logs=self.no_logs,
+            custom_labels=self.custom_labels,
             return_bytes=self.return_bytes,
+            return_entropy=self.return_entropy,
         )
 
 
 @dataclass
-class TokenizedGenerateReqInput:
-    # The request id
-    rid: str
+class TokenizedGenerateReqInput(BaseReq):
     # The input text
     input_text: str
     # The input token ids
@@ -570,6 +597,7 @@ class TokenizedGenerateReqInput:
     token_ids_logprob: List[int]
     # Whether to stream output
     stream: bool
+
     # Whether to return hidden states
     return_hidden_states: bool = False
 
@@ -596,24 +624,27 @@ class TokenizedGenerateReqInput:
     # For data parallel rank routing
     data_parallel_rank: Optional[int] = None
 
-    # For dp balance
-    dp_balance_id: int = -1
-
-    # Label for the request
-    label: Optional[str] = None
-
     # Priority for the request
     priority: Optional[int] = None
 
-    # Image gen grpc migration
-    return_bytes: bool = False
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[str] = None
+
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
 
     # tracing context
     trace_context: Optional[Dict] = None
 
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
+
+    # Whether to return entropy
+    return_entropy: bool = False
+
 
 @dataclass
-class BatchTokenizedGenerateReqInput:
+class BatchTokenizedGenerateReqInput(BaseBatchReq):
     # The batch of tokenized requests
     batch: List[TokenizedGenerateReqInput]
 
@@ -628,7 +659,7 @@ class BatchTokenizedGenerateReqInput:
 
 
 @dataclass
-class EmbeddingReqInput:
+class EmbeddingReqInput(BaseReq):
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[List[str]], List[str], str]] = None
     # The image input. It can be an image instance, file name, URL, or base64 encoded string.
@@ -644,8 +675,6 @@ class EmbeddingReqInput:
     audio_data: Optional[MultimodalDataInputFormat] = None
     # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
-    # The request id.
-    rid: Optional[Union[List[str], str]] = None
     # Dummy sampling params for compatibility
     sampling_params: Optional[Union[List[Dict], Dict]] = None
     # Dummy input embeds for compatibility
@@ -656,6 +685,8 @@ class EmbeddingReqInput:
     modalities: Optional[List[str]] = None
     # For cross-encoder requests
    is_cross_encoder_request: bool = False
+    # Priority for the request
+    priority: Optional[int] = None
 
     # For background responses (OpenAI responses API)
     background: bool = False
@@ -714,10 +745,6 @@ class EmbeddingReqInput:
             for i in range(self.batch_size):
                 self.sampling_params[i]["max_new_tokens"] = 0
 
-    def regenerate_rid(self):
-        self.rid = uuid.uuid4().hex
-        return self.rid
-
     def contains_mm_input(self) -> bool:
         return (
             has_valid_data(self.image_data)
@@ -746,9 +773,7 @@ class EmbeddingReqInput:
 
 
 @dataclass
-class TokenizedEmbeddingReqInput:
-    # The request id
-    rid: str
+class TokenizedEmbeddingReqInput(BaseReq):
     # The input text
     input_text: str
     # The input token ids
@@ -761,12 +786,12 @@ class TokenizedEmbeddingReqInput:
     sampling_params: SamplingParams
     # For data parallel rank routing
     data_parallel_rank: Optional[int] = None
-    # For dp balance
-    dp_balance_id: int = -1
+    # Priority for the request
+    priority: Optional[int] = None
 
 
 @dataclass
-class BatchTokenizedEmbeddingReqInput:
+class BatchTokenizedEmbeddingReqInput(BaseBatchReq):
     # The batch of tokenized embedding requests
     batch: List[TokenizedEmbeddingReqInput]
 
@@ -781,9 +806,7 @@ class BatchTokenizedEmbeddingReqInput:
 
 
 @dataclass
-class BatchTokenIDOut:
-    # The request id
-    rids: List[str]
+class BatchTokenIDOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # For incremental decoding
@@ -802,6 +825,7 @@ class BatchTokenIDOut:
     completion_tokens: List[int]
     cached_tokens: List[int]
     spec_verify_ct: List[int]
+    spec_accepted_tokens: List[int]
 
     # Logprobs
     input_token_logprobs_val: List[float]
@@ -816,6 +840,7 @@ class BatchTokenIDOut:
     input_token_ids_logprobs_idx: List[List]
     output_token_ids_logprobs_val: List[List]
     output_token_ids_logprobs_idx: List[List]
+    output_token_entropy_val: List[float]
 
     # Hidden states
     output_hidden_states: List[List[float]]
@@ -826,9 +851,12 @@ class BatchTokenIDOut:
     placeholder_tokens_idx: List[Optional[List[int]]]
     placeholder_tokens_val: List[Optional[List[int]]]
 
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_steps: List[List[int]] = None
+
 
 @dataclass
-class BatchMultimodalDecodeReq:
+class BatchMultimodalDecodeReq(BaseBatchReq):
     decoded_ids: List[int]
     input_token_logprobs_val: List[float]
     input_token_logprobs_idx: List[int]
@@ -840,8 +868,6 @@ class BatchMultimodalDecodeReq:
     image_resolutions: List[List[int]]
     resize_image_resolutions: List[List[int]]
 
-    # The request id
-    rids: List[str]
     finished_reasons: List[BaseFinishReason]
 
     # Token counts
@@ -849,17 +875,20 @@ class BatchMultimodalDecodeReq:
     completion_tokens: List[int]
     cached_tokens: List[int]
 
-    # Placeholder token info
+    # The information of placeholder tokens (e.g., image token)
+    # idx is the index of the token in the prompt after expansion.
+    # val is the length of padded tokens after expansion.
     placeholder_tokens_idx: List[Optional[List[int]]]
     placeholder_tokens_val: List[Optional[List[int]]]
 
-    return_bytes: bool = False
+    return_bytes: List[bool]
+
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_steps: List[List[int]] = None
 
 
 @dataclass
-class BatchStrOut:
-    # The request id
-    rids: List[str]
+class BatchStrOutput(BaseBatchReq):
    # The finish reason
     finished_reasons: List[dict]
     # The output decoded strings
@@ -872,6 +901,7 @@ class BatchStrOut:
     completion_tokens: List[int]
     cached_tokens: List[int]
     spec_verify_ct: List[int]
+    spec_accepted_tokens: List[int]
 
     # Logprobs
     input_token_logprobs_val: List[float]
@@ -886,18 +916,23 @@ class BatchStrOut:
     input_token_ids_logprobs_idx: List[List]
     output_token_ids_logprobs_val: List[List]
     output_token_ids_logprobs_idx: List[List]
+    output_token_entropy_val: List[float]
 
     # Hidden states
     output_hidden_states: List[List[float]]
 
+    # The information of placeholder tokens (e.g., image token)
+    # idx is the index of the token in the prompt after expansion.
+    # val is the length of padded tokens after expansion.
     placeholder_tokens_idx: List[Optional[List[int]]]
     placeholder_tokens_val: List[Optional[List[int]]]
 
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_steps: List[List[int]] = None
+
 
 @dataclass
-class BatchMultimodalOut:
-    # The request id
-    rids: List[str]
+class BatchMultimodalOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[dict]
     decoded_ids: List[List[int]]
@@ -922,13 +957,11 @@ class BatchMultimodalOut:
 
 
 @dataclass
-class BatchEmbeddingOut:
-    # The request id
-    rids: List[str]
+class BatchEmbeddingOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # The output embedding
-    embeddings: List[List[float]]
+    embeddings: Union[List[List[float]], List[Dict[int, float]]]
     # Token counts
     prompt_tokens: List[int]
     cached_tokens: List[int]
@@ -938,27 +971,27 @@ class BatchEmbeddingOut:
 
 
 @dataclass
-class ClearHiCacheReqInput:
+class ClearHiCacheReqInput(BaseReq):
     pass
 
 
 @dataclass
-class ClearHiCacheReqOutput:
+class ClearHiCacheReqOutput(BaseReq):
     success: bool
 
 
 @dataclass
-class FlushCacheReqInput:
+class FlushCacheReqInput(BaseReq):
     pass
 
 
 @dataclass
-class FlushCacheReqOutput:
+class FlushCacheReqOutput(BaseReq):
     success: bool
 
 
 @dataclass
-class UpdateWeightFromDiskReqInput:
+class UpdateWeightFromDiskReqInput(BaseReq):
     # The model path with the new weights
     model_path: str
     # The format to load the weights
@@ -973,10 +1006,12 @@ class UpdateWeightFromDiskReqInput:
     torch_empty_cache: bool = False
     # Whether to keep the scheduler paused after weight update
     keep_pause: bool = False
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_step: int = 0
 
 
 @dataclass
-class UpdateWeightFromDiskReqOutput:
+class UpdateWeightFromDiskReqOutput(BaseReq):
     success: bool
     message: str
     # Number of paused requests during weight sync.
@@ -984,7 +1019,7 @@ class UpdateWeightFromDiskReqOutput:
 
 
 @dataclass
-class UpdateWeightsFromDistributedReqInput:
+class UpdateWeightsFromDistributedReqInput(BaseReq):
     names: List[str]
     dtypes: List[str]
     shapes: List[List[int]]
@@ -999,13 +1034,13 @@ class UpdateWeightsFromDistributedReqInput:
 
 
 @dataclass
-class UpdateWeightsFromDistributedReqOutput:
+class UpdateWeightsFromDistributedReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class UpdateWeightsFromTensorReqInput:
+class UpdateWeightsFromTensorReqInput(BaseReq):
     """Update model weights from tensor input.
 
     - Tensors are serialized for transmission
@@ -1024,13 +1059,13 @@ class UpdateWeightsFromTensorReqInput:
 
 
 @dataclass
-class UpdateWeightsFromTensorReqOutput:
+class UpdateWeightsFromTensorReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class InitWeightsSendGroupForRemoteInstanceReqInput:
+class InitWeightsSendGroupForRemoteInstanceReqInput(BaseReq):
     # The master address
     master_address: str
     # The ports for each rank's communication group
@@ -1045,14 +1080,32 @@ class InitWeightsSendGroupForRemoteInstanceReqInput:
     backend: str = "nccl"
 
 
+# Now UpdateWeightsFromIPCReqInput and UpdateWeightsFromIPCReqOutput
+# are only used by Checkpoint Engine (https://github.com/MoonshotAI/checkpoint-engine)
+@dataclass
+class UpdateWeightsFromIPCReqInput(BaseReq):
+    # ZMQ socket paths for each device UUID
+    zmq_handles: Dict[str, str]
+    # Whether to flush cache after weight update
+    flush_cache: bool = True
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+
+
+@dataclass
+class UpdateWeightsFromIPCReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
 @dataclass
-class InitWeightsSendGroupForRemoteInstanceReqOutput:
+class InitWeightsSendGroupForRemoteInstanceReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class SendWeightsToRemoteInstanceReqInput:
+class SendWeightsToRemoteInstanceReqInput(BaseReq):
     # The master address
     master_address: str
     # The ports for each rank's communication group
@@ -1062,13 +1115,13 @@ class SendWeightsToRemoteInstanceReqInput:
 
 
 @dataclass
-class SendWeightsToRemoteInstanceReqOutput:
+class SendWeightsToRemoteInstanceReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class InitWeightsUpdateGroupReqInput:
+class InitWeightsUpdateGroupReqInput(BaseReq):
     # The master address
     master_address: str
     # The master port
@@ -1084,13 +1137,24 @@ class InitWeightsUpdateGroupReqInput:
 
 
 @dataclass
-class InitWeightsUpdateGroupReqOutput:
+class InitWeightsUpdateGroupReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class UpdateWeightVersionReqInput:
+class DestroyWeightsUpdateGroupReqInput(BaseReq):
+    group_name: str = "weight_update_group"
+
+
+@dataclass
+class DestroyWeightsUpdateGroupReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class UpdateWeightVersionReqInput(BaseReq):
     # The new weight version
     new_version: str
     # Whether to abort all running requests before updating
@@ -1098,89 +1162,87 @@ class UpdateWeightVersionReqInput:
 
 
 @dataclass
-class GetWeightsByNameReqInput:
+class GetWeightsByNameReqInput(BaseReq):
     name: str
     truncate_size: int = 100
 
 
 @dataclass
-class GetWeightsByNameReqOutput:
+class GetWeightsByNameReqOutput(BaseReq):
     parameter: list
 
 
 @dataclass
-class ReleaseMemoryOccupationReqInput:
+class ReleaseMemoryOccupationReqInput(BaseReq):
     # Optional tags to identify the memory region, which is primarily used for RL
     # Currently we only support `weights` and `kv_cache`
     tags: Optional[List[str]] = None
 
 
 @dataclass
-class ReleaseMemoryOccupationReqOutput:
+class ReleaseMemoryOccupationReqOutput(BaseReq):
     pass
 
 
 @dataclass
-class ResumeMemoryOccupationReqInput:
+class ResumeMemoryOccupationReqInput(BaseReq):
     # Optional tags to identify the memory region, which is primarily used for RL
     # Currently we only support `weights` and `kv_cache`
     tags: Optional[List[str]] = None
 
 
 @dataclass
-class ResumeMemoryOccupationReqOutput:
+class ResumeMemoryOccupationReqOutput(BaseReq):
     pass
 
 
 @dataclass
-class SlowDownReqInput:
+class SlowDownReqInput(BaseReq):
     forward_sleep_time: Optional[float]
 
 
 @dataclass
-class SlowDownReqOutput:
+class SlowDownReqOutput(BaseReq):
     pass
 
 
 @dataclass
-class AbortReq:
-    # The request id
-    rid: str = ""
+class AbortReq(BaseReq):
     # Whether to abort all requests
     abort_all: bool = False
     # The finished reason data
     finished_reason: Optional[Dict[str, Any]] = None
     abort_reason: Optional[str] = None
-    # used in MultiTokenzierManager mode
-    rids: Optional[Union[List[str], str]] = None
 
     def __post_init__(self):
-        self.rids = self.rid
+        # FIXME: This is a hack to keep the same with the old code
+        if self.rid is None:
+            self.rid = ""
 
 
 @dataclass
-class GetInternalStateReq:
+class GetInternalStateReq(BaseReq):
     pass
 
 
 @dataclass
-class GetInternalStateReqOutput:
+class GetInternalStateReqOutput(BaseReq):
     internal_state: Dict[Any, Any]
 
 
 @dataclass
-class SetInternalStateReq:
+class SetInternalStateReq(BaseReq):
     server_args: Dict[str, Any]
 
 
 @dataclass
-class SetInternalStateReqOutput:
+class SetInternalStateReqOutput(BaseReq):
     updated: bool
     server_args: Dict[str, Any]
 
 
 @dataclass
-class ProfileReqInput:
+class ProfileReqInput(BaseReq):
     # The output directory
     output_dir: Optional[str] = None
     # If set, it profile as many as this number of steps.
@@ -1192,6 +1254,8 @@ class ProfileReqInput:
     profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
+    # Merge profiles from all ranks into a single trace
+    merge_profiles: bool = False
 
 
 class ProfileReqType(Enum):
@@ -1200,7 +1264,7 @@ class ProfileReqType(Enum):
 
 
 @dataclass
-class ProfileReq:
+class ProfileReq(BaseReq):
     type: ProfileReqType
     output_dir: Optional[str] = None
     start_step: Optional[int] = None
@@ -1210,21 +1274,23 @@ class ProfileReq:
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
     profile_id: Optional[str] = None
+    # Merge profiles from all ranks into a single trace
+    merge_profiles: bool = False
 
 
 @dataclass
-class ProfileReqOutput:
+class ProfileReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class FreezeGCReq:
+class FreezeGCReq(BaseReq):
     pass
 
 
 @dataclass
-class ConfigureLoggingReq:
+class ConfigureLoggingReq(BaseReq):
     log_requests: Optional[bool] = None
     log_requests_level: Optional[int] = None
     dump_requests_folder: Optional[str] = None
@@ -1233,35 +1299,39 @@ class ConfigureLoggingReq:
 
 
 @dataclass
-class OpenSessionReqInput:
+class OpenSessionReqInput(BaseReq):
     capacity_of_str_len: int
     session_id: Optional[str] = None
 
 
 @dataclass
-class CloseSessionReqInput:
+class CloseSessionReqInput(BaseReq):
     session_id: str
 
 
 @dataclass
-class OpenSessionReqOutput:
+class OpenSessionReqOutput(BaseReq):
     session_id: Optional[str]
     success: bool
 
 
 @dataclass
-class HealthCheckOutput:
+class HealthCheckOutput(BaseReq):
     pass
 
 
-class ExpertDistributionReq(Enum):
+class ExpertDistributionReqType(Enum):
     START_RECORD = 1
     STOP_RECORD = 2
     DUMP_RECORD = 3
 
 
+class ExpertDistributionReq(BaseReq):
+    action: ExpertDistributionReqType
+
+
 @dataclass
-class ExpertDistributionReqOutput:
+class ExpertDistributionReqOutput(BaseReq):
     pass
 
 
@@ -1279,7 +1349,7 @@ class Tool:
 
 
 @dataclass
-class ParseFunctionCallReq:
+class ParseFunctionCallReq(BaseReq):
     text: str  # The text to parse.
     tools: List[Tool] = field(
         default_factory=list
@@ -1290,31 +1360,31 @@ class ParseFunctionCallReq:
 
 
 @dataclass
-class SeparateReasoningReqInput:
+class SeparateReasoningReqInput(BaseReq):
     text: str  # The text to parse.
     reasoning_parser: str  # Specify the parser type, e.g., "deepseek-r1".
 
 
 @dataclass
-class VertexGenerateReqInput:
+class VertexGenerateReqInput(BaseReq):
     instances: List[dict]
     parameters: Optional[dict] = None
 
 
 @dataclass
-class RpcReqInput:
+class RpcReqInput(BaseReq):
     method: str
     parameters: Optional[Dict] = None
 
 
 @dataclass
-class RpcReqOutput:
+class RpcReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class LoadLoRAAdapterReqInput:
+class LoadLoRAAdapterReqInput(BaseReq):
     # The name of the lora module to newly loaded.
     lora_name: str
     # The path of loading.
@@ -1334,7 +1404,7 @@ class LoadLoRAAdapterReqInput:
 
 
 @dataclass
-class UnloadLoRAAdapterReqInput:
+class UnloadLoRAAdapterReqInput(BaseReq):
     # The name of lora module to unload.
     lora_name: str
     # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
@@ -1348,25 +1418,13 @@ class UnloadLoRAAdapterReqInput:
 
 
 @dataclass
-class LoRAUpdateResult:
+class LoRAUpdateOutput(BaseReq):
     success: bool
     error_message: Optional[str] = None
     loaded_adapters: Optional[Dict[str, LoRARef]] = None
 
 
-LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult
-
-
-@dataclass
-class MultiTokenizerRegisterReq:
-    rids: Optional[Union[List[str], str]] = None
-    ipc_name: Optional[str] = None
-
-
-@dataclass
-class MultiTokenizerWrapper:
-    worker_id: int
-    obj: Optional[Any] = None
+LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateOutput
 
 
 class BlockReqType(Enum):
@@ -1375,17 +1433,17 @@ class BlockReqType(Enum):
 
 
 @dataclass
-class BlockReqInput:
+class BlockReqInput(BaseReq):
     type: BlockReqType
 
 
 @dataclass
-class GetLoadReqInput:
+class GetLoadReqInput(BaseReq):
     pass
 
 
 @dataclass
-class GetLoadReqOutput:
+class GetLoadReqOutput(BaseReq):
     dp_rank: int
     num_reqs: int
     num_waiting_reqs: int
@@ -1393,5 +1451,41 @@ class GetLoadReqOutput:
 
 
 @dataclass
-class WatchLoadUpdateReq:
+class WatchLoadUpdateReq(BaseReq):
     loads: List[GetLoadReqOutput]
+
+
+@dataclass
+class LazyDumpTensorsReqInput(BaseReq):
+    pass
+
+
+@dataclass
+class LazyDumpTensorsReqOutput(BaseReq):
+    success: bool
+
+
+def _check_all_req_types():
+    """A helper function to check all request types are defined in this file."""
+    import inspect
+    import sys
+
+    all_classes = inspect.getmembers(sys.modules[__name__], inspect.isclass)
+    for class_type in all_classes:
+        # check its name
+        name = class_type[0]
+        is_io_struct = (
+            name.endswith("Req") or name.endswith("Input") or name.endswith("Output")
+        )
+        is_base_req = issubclass(class_type[1], BaseReq) or issubclass(
+            class_type[1], BaseBatchReq
+        )
+        if is_io_struct and not is_base_req:
+            raise ValueError(f"{name} is not a subclass of BaseReq or BaseBatchReq.")
+        if is_base_req and not is_io_struct:
+            raise ValueError(
+                f"{name} is a subclass of BaseReq but not follow the naming convention."
+            )
+
+
+_check_all_req_types()
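
Note on the hunk above: the central change in io_struct.py is that every request/response struct now inherits from BaseReq or BaseBatchReq, which carry the request-id bookkeeping (rid / http_worker_ipc and regenerate_rid, or their batch counterparts) as keyword-only fields, so the per-class rid fields and regenerate_rid helpers could be removed. The following is a minimal standalone sketch of that pattern; ToyGenerateReqInput is a hypothetical class used only for illustration (it does not import sglang), the BaseReq body is copied from the diff, and dataclasses.field(kw_only=True) requires Python 3.10+.

import uuid
from abc import ABC
from dataclasses import dataclass, field
from typing import List, Optional, Union


@dataclass
class BaseReq(ABC):
    # Keyword-only bookkeeping fields shared by all request types (as in the diff above).
    rid: Optional[Union[str, List[str]]] = field(default=None, kw_only=True)
    http_worker_ipc: Optional[str] = field(default=None, kw_only=True)

    def regenerate_rid(self):
        """Generate a new request ID (or one per request when rid is a list) and return it."""
        if isinstance(self.rid, list):
            self.rid = [uuid.uuid4().hex for _ in range(len(self.rid))]
        else:
            self.rid = uuid.uuid4().hex
        return self.rid


@dataclass
class ToyGenerateReqInput(BaseReq):
    # Hypothetical request type; real ones (GenerateReqInput, AbortReq, ...) live in
    # sglang/srt/managers/io_struct.py and only add their payload fields.
    text: str = ""


req = ToyGenerateReqInput(text="hello")  # rid defaults to None and is keyword-only
print(req.regenerate_rid())              # prints a fresh 32-character hex request id

Because the id fields are keyword-only on the base class, subclasses can keep declaring positional payload fields without running into dataclass default-ordering errors, and the _check_all_req_types() hook at the end of the file enforces that anything named *Req/*Input/*Output actually inherits from one of the two bases.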