sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py
CHANGED
@@ -18,6 +18,7 @@ processes (TokenizerManager, DetokenizerManager, Scheduler).
 
 import copy
 import uuid
+from abc import ABC
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
@@ -35,10 +36,33 @@ else:
     Image = Any
 
 
+# Parameters for a session
+@dataclass
+class BaseReq(ABC):
+    rid: Optional[Union[str, List[str]]] = field(default=None, kw_only=True)
+
+    def regenerate_rid(self):
+        """Generate a new request ID and return it."""
+        if isinstance(self.rid, list):
+            self.rid = [uuid.uuid4().hex for _ in range(len(self.rid))]
+        else:
+            self.rid = uuid.uuid4().hex
+        return self.rid
+
+
+@dataclass
+class BaseBatchReq(ABC):
+    rids: Optional[List[str]] = field(default=None, kw_only=True)
+
+    def regenerate_rids(self):
+        """Generate new request IDs and return them."""
+        self.rids = [uuid.uuid4().hex for _ in range(len(self.rids))]
+        return self.rids
+
+
 @dataclass
 class SessionParams:
     id: Optional[str] = None
-    rid: Optional[str] = None
     offset: Optional[int] = None
     replace: Optional[bool] = None
     drop_previous_output: Optional[bool] = None
@@ -62,7 +86,7 @@ MultimodalDataInputFormat = Union[
 
 
 @dataclass
-class GenerateReqInput:
+class GenerateReqInput(BaseReq):
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[str], str]] = None
     # The token ids for text; one can specify either text or input_ids
@@ -82,8 +106,6 @@ class GenerateReqInput:
     audio_data: Optional[MultimodalDataInputFormat] = None
     # The sampling_params. See descriptions below.
     sampling_params: Optional[Union[List[Dict], Dict]] = None
-    # The request id.
-    rid: Optional[Union[List[str], str]] = None
     # Whether to return logprobs.
     return_logprob: Optional[Union[List[bool], bool]] = None
     # If return logprobs, the start location in the prompt for returning logprobs.
@@ -132,17 +154,20 @@ class GenerateReqInput:
     # Conversation id used for tracking requests
     conversation_id: Optional[str] = None
 
-    # Label for the request
-    label: Optional[str] = None
-
     # Priority for the request
     priority: Optional[int] = None
 
-    #
-
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
 
-    #
-
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
+
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
 
     def contains_mm_input(self) -> bool:
         return (
@@ -485,11 +510,6 @@ class GenerateReqInput:
             ):
                 raise ValueError("Session params must be a dict or a list of dicts.")
 
-    def regenerate_rid(self):
-        """Generate a new request ID and return it."""
-        self.rid = uuid.uuid4().hex
-        return self.rid
-
     def __getitem__(self, i):
         return GenerateReqInput(
             text=self.text[i] if self.text is not None else None,
@@ -542,16 +562,16 @@ class GenerateReqInput:
                 self.data_parallel_rank if self.data_parallel_rank is not None else None
             ),
            conversation_id=self.conversation_id,
-            label=self.label,
             priority=self.priority,
+            extra_key=self.extra_key,
+            no_logs=self.no_logs,
+            custom_labels=self.custom_labels,
             return_bytes=self.return_bytes,
         )
 
 
 @dataclass
-class TokenizedGenerateReqInput:
-    # The request id
-    rid: str
+class TokenizedGenerateReqInput(BaseReq):
     # The input text
     input_text: str
     # The input token ids
@@ -570,6 +590,7 @@ class TokenizedGenerateReqInput:
     token_ids_logprob: List[int]
     # Whether to stream output
     stream: bool
+
     # Whether to return hidden states
     return_hidden_states: bool = False
 
@@ -596,24 +617,24 @@ class TokenizedGenerateReqInput:
     # For data parallel rank routing
     data_parallel_rank: Optional[int] = None
 
-    # For dp balance
-    dp_balance_id: int = -1
-
-    # Label for the request
-    label: Optional[str] = None
-
     # Priority for the request
     priority: Optional[int] = None
 
-    #
-
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[str] = None
+
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
 
     # tracing context
     trace_context: Optional[Dict] = None
 
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
+
 
 @dataclass
-class BatchTokenizedGenerateReqInput:
+class BatchTokenizedGenerateReqInput(BaseBatchReq):
     # The batch of tokenized requests
     batch: List[TokenizedGenerateReqInput]
 
@@ -628,7 +649,7 @@ class BatchTokenizedGenerateReqInput:
 
 
 @dataclass
-class EmbeddingReqInput:
+class EmbeddingReqInput(BaseReq):
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[List[str]], List[str], str]] = None
     # The image input. It can be an image instance, file name, URL, or base64 encoded string.
@@ -644,8 +665,6 @@ class EmbeddingReqInput:
     audio_data: Optional[MultimodalDataInputFormat] = None
     # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
-    # The request id.
-    rid: Optional[Union[List[str], str]] = None
     # Dummy sampling params for compatibility
     sampling_params: Optional[Union[List[Dict], Dict]] = None
     # Dummy input embeds for compatibility
@@ -656,6 +675,8 @@ class EmbeddingReqInput:
     modalities: Optional[List[str]] = None
     # For cross-encoder requests
     is_cross_encoder_request: bool = False
+    # Priority for the request
+    priority: Optional[int] = None
 
     # For background responses (OpenAI responses API)
     background: bool = False
@@ -714,10 +735,6 @@ class EmbeddingReqInput:
             for i in range(self.batch_size):
                 self.sampling_params[i]["max_new_tokens"] = 0
 
-    def regenerate_rid(self):
-        self.rid = uuid.uuid4().hex
-        return self.rid
-
     def contains_mm_input(self) -> bool:
         return (
             has_valid_data(self.image_data)
@@ -746,9 +763,7 @@ class EmbeddingReqInput:
 
 
 @dataclass
-class TokenizedEmbeddingReqInput:
-    # The request id
-    rid: str
+class TokenizedEmbeddingReqInput(BaseReq):
     # The input text
     input_text: str
     # The input token ids
@@ -761,12 +776,12 @@ class TokenizedEmbeddingReqInput:
     sampling_params: SamplingParams
     # For data parallel rank routing
     data_parallel_rank: Optional[int] = None
-    #
-
+    # Priority for the request
+    priority: Optional[int] = None
 
 
 @dataclass
-class BatchTokenizedEmbeddingReqInput:
+class BatchTokenizedEmbeddingReqInput(BaseBatchReq):
     # The batch of tokenized embedding requests
     batch: List[TokenizedEmbeddingReqInput]
 
@@ -781,9 +796,7 @@ class BatchTokenizedEmbeddingReqInput:
 
 
 @dataclass
-class BatchTokenIDOut:
-    # The request id
-    rids: List[str]
+class BatchTokenIDOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # For incremental decoding
@@ -828,7 +841,7 @@ class BatchTokenIDOut:
 
 
 @dataclass
-class BatchMultimodalDecodeReq:
+class BatchMultimodalDecodeReq(BaseBatchReq):
     decoded_ids: List[int]
     input_token_logprobs_val: List[float]
     input_token_logprobs_idx: List[int]
@@ -840,8 +853,6 @@ class BatchMultimodalDecodeReq:
     image_resolutions: List[List[int]]
     resize_image_resolutions: List[List[int]]
 
-    # The request id
-    rids: List[str]
     finished_reasons: List[BaseFinishReason]
 
     # Token counts
@@ -857,9 +868,7 @@ class BatchMultimodalDecodeReq:
 
 
 @dataclass
-class BatchStrOut:
-    # The request id
-    rids: List[str]
+class BatchStrOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[dict]
     # The output decoded strings
@@ -895,9 +904,7 @@ class BatchStrOut:
 
 
 @dataclass
-class BatchMultimodalOut:
-    # The request id
-    rids: List[str]
+class BatchMultimodalOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[dict]
     decoded_ids: List[List[int]]
@@ -922,9 +929,7 @@ class BatchMultimodalOut:
 
 
 @dataclass
-class BatchEmbeddingOut:
-    # The request id
-    rids: List[str]
+class BatchEmbeddingOutput(BaseBatchReq):
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # The output embedding
@@ -938,27 +943,27 @@ class BatchEmbeddingOut:
 
 
 @dataclass
-class ClearHiCacheReqInput:
+class ClearHiCacheReqInput(BaseReq):
     pass
 
 
 @dataclass
-class ClearHiCacheReqOutput:
+class ClearHiCacheReqOutput(BaseReq):
     success: bool
 
 
 @dataclass
-class FlushCacheReqInput:
+class FlushCacheReqInput(BaseReq):
     pass
 
 
 @dataclass
-class FlushCacheReqOutput:
+class FlushCacheReqOutput(BaseReq):
     success: bool
 
 
 @dataclass
-class UpdateWeightFromDiskReqInput:
+class UpdateWeightFromDiskReqInput(BaseReq):
     # The model path with the new weights
     model_path: str
     # The format to load the weights
@@ -976,7 +981,7 @@ class UpdateWeightFromDiskReqInput:
 
 
 @dataclass
-class UpdateWeightFromDiskReqOutput:
+class UpdateWeightFromDiskReqOutput(BaseReq):
     success: bool
     message: str
     # Number of paused requests during weight sync.
@@ -984,7 +989,7 @@ class UpdateWeightFromDiskReqOutput:
 
 
 @dataclass
-class UpdateWeightsFromDistributedReqInput:
+class UpdateWeightsFromDistributedReqInput(BaseReq):
     names: List[str]
     dtypes: List[str]
     shapes: List[List[int]]
@@ -999,13 +1004,13 @@ class UpdateWeightsFromDistributedReqInput:
 
 
 @dataclass
-class UpdateWeightsFromDistributedReqOutput:
+class UpdateWeightsFromDistributedReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class UpdateWeightsFromTensorReqInput:
+class UpdateWeightsFromTensorReqInput(BaseReq):
     """Update model weights from tensor input.
 
     - Tensors are serialized for transmission
@@ -1024,13 +1029,13 @@ class UpdateWeightsFromTensorReqInput:
 
 
 @dataclass
-class UpdateWeightsFromTensorReqOutput:
+class UpdateWeightsFromTensorReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class InitWeightsSendGroupForRemoteInstanceReqInput:
+class InitWeightsSendGroupForRemoteInstanceReqInput(BaseReq):
     # The master address
     master_address: str
     # The ports for each rank's communication group
@@ -1046,13 +1051,13 @@ class InitWeightsSendGroupForRemoteInstanceReqInput:
 
 
 @dataclass
-class InitWeightsSendGroupForRemoteInstanceReqOutput:
+class InitWeightsSendGroupForRemoteInstanceReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class SendWeightsToRemoteInstanceReqInput:
+class SendWeightsToRemoteInstanceReqInput(BaseReq):
     # The master address
     master_address: str
     # The ports for each rank's communication group
@@ -1062,13 +1067,13 @@ class SendWeightsToRemoteInstanceReqInput:
 
 
 @dataclass
-class SendWeightsToRemoteInstanceReqOutput:
+class SendWeightsToRemoteInstanceReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class InitWeightsUpdateGroupReqInput:
+class InitWeightsUpdateGroupReqInput(BaseReq):
     # The master address
     master_address: str
     # The master port
@@ -1084,13 +1089,24 @@ class InitWeightsUpdateGroupReqInput:
 
 
 @dataclass
-class InitWeightsUpdateGroupReqOutput:
+class InitWeightsUpdateGroupReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class DestroyWeightsUpdateGroupReqInput(BaseReq):
+    group_name: str = "weight_update_group"
+
+
+@dataclass
+class DestroyWeightsUpdateGroupReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class UpdateWeightVersionReqInput:
+class UpdateWeightVersionReqInput(BaseReq):
     # The new weight version
     new_version: str
     # Whether to abort all running requests before updating
@@ -1098,89 +1114,87 @@ class UpdateWeightVersionReqInput:
 
 
 @dataclass
-class GetWeightsByNameReqInput:
+class GetWeightsByNameReqInput(BaseReq):
     name: str
     truncate_size: int = 100
 
 
 @dataclass
-class GetWeightsByNameReqOutput:
+class GetWeightsByNameReqOutput(BaseReq):
     parameter: list
 
 
 @dataclass
-class ReleaseMemoryOccupationReqInput:
+class ReleaseMemoryOccupationReqInput(BaseReq):
     # Optional tags to identify the memory region, which is primarily used for RL
     # Currently we only support `weights` and `kv_cache`
     tags: Optional[List[str]] = None
 
 
 @dataclass
-class ReleaseMemoryOccupationReqOutput:
+class ReleaseMemoryOccupationReqOutput(BaseReq):
     pass
 
 
 @dataclass
-class ResumeMemoryOccupationReqInput:
+class ResumeMemoryOccupationReqInput(BaseReq):
     # Optional tags to identify the memory region, which is primarily used for RL
    # Currently we only support `weights` and `kv_cache`
     tags: Optional[List[str]] = None
 
 
 @dataclass
-class ResumeMemoryOccupationReqOutput:
+class ResumeMemoryOccupationReqOutput(BaseReq):
     pass
 
 
 @dataclass
-class SlowDownReqInput:
+class SlowDownReqInput(BaseReq):
     forward_sleep_time: Optional[float]
 
 
 @dataclass
-class SlowDownReqOutput:
+class SlowDownReqOutput(BaseReq):
     pass
 
 
 @dataclass
-class AbortReq:
-    # The request id
-    rid: str = ""
+class AbortReq(BaseReq):
     # Whether to abort all requests
     abort_all: bool = False
     # The finished reason data
     finished_reason: Optional[Dict[str, Any]] = None
     abort_reason: Optional[str] = None
-    # used in MultiTokenzierManager mode
-    rids: Optional[Union[List[str], str]] = None
 
     def __post_init__(self):
-
+        # FIXME: This is a hack to keep the same with the old code
+        if self.rid is None:
+            self.rid = ""
 
 
 @dataclass
-class GetInternalStateReq:
+class GetInternalStateReq(BaseReq):
     pass
 
 
 @dataclass
-class GetInternalStateReqOutput:
+class GetInternalStateReqOutput(BaseReq):
     internal_state: Dict[Any, Any]
 
 
 @dataclass
-class SetInternalStateReq:
+class SetInternalStateReq(BaseReq):
     server_args: Dict[str, Any]
 
 
 @dataclass
-class SetInternalStateReqOutput:
+class SetInternalStateReqOutput(BaseReq):
     updated: bool
     server_args: Dict[str, Any]
 
 
 @dataclass
-class ProfileReqInput:
+class ProfileReqInput(BaseReq):
     # The output directory
     output_dir: Optional[str] = None
     # If set, it profile as many as this number of steps.
@@ -1200,7 +1214,7 @@ class ProfileReqType(Enum):
 
 
 @dataclass
-class ProfileReq:
+class ProfileReq(BaseReq):
     type: ProfileReqType
     output_dir: Optional[str] = None
     start_step: Optional[int] = None
@@ -1213,18 +1227,18 @@ class ProfileReq:
 
 
 @dataclass
-class ProfileReqOutput:
+class ProfileReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class FreezeGCReq:
+class FreezeGCReq(BaseReq):
     pass
 
 
 @dataclass
-class ConfigureLoggingReq:
+class ConfigureLoggingReq(BaseReq):
     log_requests: Optional[bool] = None
     log_requests_level: Optional[int] = None
     dump_requests_folder: Optional[str] = None
@@ -1233,35 +1247,39 @@ class ConfigureLoggingReq:
 
 
 @dataclass
-class OpenSessionReqInput:
+class OpenSessionReqInput(BaseReq):
     capacity_of_str_len: int
     session_id: Optional[str] = None
 
 
 @dataclass
-class CloseSessionReqInput:
+class CloseSessionReqInput(BaseReq):
     session_id: str
 
 
 @dataclass
-class OpenSessionReqOutput:
+class OpenSessionReqOutput(BaseReq):
     session_id: Optional[str]
     success: bool
 
 
 @dataclass
-class HealthCheckOutput:
+class HealthCheckOutput(BaseReq):
     pass
 
 
-class ExpertDistributionReq(Enum):
+class ExpertDistributionReqType(Enum):
     START_RECORD = 1
     STOP_RECORD = 2
     DUMP_RECORD = 3
 
 
+class ExpertDistributionReq(BaseReq):
+    action: ExpertDistributionReqType
+
+
 @dataclass
-class ExpertDistributionReqOutput:
+class ExpertDistributionReqOutput(BaseReq):
     pass
 
 
@@ -1279,7 +1297,7 @@ class Tool:
 
 
 @dataclass
-class ParseFunctionCallReq:
+class ParseFunctionCallReq(BaseReq):
     text: str  # The text to parse.
     tools: List[Tool] = field(
         default_factory=list
@@ -1290,31 +1308,31 @@ class ParseFunctionCallReq:
 
 
 @dataclass
-class SeparateReasoningReqInput:
+class SeparateReasoningReqInput(BaseReq):
     text: str  # The text to parse.
     reasoning_parser: str  # Specify the parser type, e.g., "deepseek-r1".
 
 
 @dataclass
-class VertexGenerateReqInput:
+class VertexGenerateReqInput(BaseReq):
     instances: List[dict]
     parameters: Optional[dict] = None
 
 
 @dataclass
-class RpcReqInput:
+class RpcReqInput(BaseReq):
     method: str
     parameters: Optional[Dict] = None
 
 
 @dataclass
-class RpcReqOutput:
+class RpcReqOutput(BaseReq):
     success: bool
     message: str
 
 
 @dataclass
-class LoadLoRAAdapterReqInput:
+class LoadLoRAAdapterReqInput(BaseReq):
     # The name of the lora module to newly loaded.
     lora_name: str
     # The path of loading.
@@ -1334,7 +1352,7 @@ class LoadLoRAAdapterReqInput:
 
 
 @dataclass
-class UnloadLoRAAdapterReqInput:
+class UnloadLoRAAdapterReqInput(BaseReq):
     # The name of lora module to unload.
     lora_name: str
     # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
@@ -1348,23 +1366,23 @@ class UnloadLoRAAdapterReqInput:
 
 
 @dataclass
-class
+class LoRAUpdateOutput(BaseReq):
     success: bool
     error_message: Optional[str] = None
     loaded_adapters: Optional[Dict[str, LoRARef]] = None
 
 
-LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput =
+LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateOutput
 
 
 @dataclass
-class MultiTokenizerRegisterReq:
-    rids: Optional[Union[List[str], str]] = None
+class MultiTokenizerRegisterReq(BaseBatchReq):
     ipc_name: Optional[str] = None
 
 
 @dataclass
 class MultiTokenizerWrapper:
+    # FIXME(lsyin): remove this
     worker_id: int
     obj: Optional[Any] = None
 
@@ -1375,17 +1393,17 @@ class BlockReqType(Enum):
 
 
 @dataclass
-class BlockReqInput:
+class BlockReqInput(BaseReq):
     type: BlockReqType
 
 
 @dataclass
-class GetLoadReqInput:
+class GetLoadReqInput(BaseReq):
     pass
 
 
 @dataclass
-class GetLoadReqOutput:
+class GetLoadReqOutput(BaseReq):
     dp_rank: int
     num_reqs: int
     num_waiting_reqs: int
@@ -1393,5 +1411,31 @@ class GetLoadReqOutput:
 
 
 @dataclass
-class WatchLoadUpdateReq:
+class WatchLoadUpdateReq(BaseReq):
     loads: List[GetLoadReqOutput]
+
+
+def _check_all_req_types():
+    """A helper function to check all request types are defined in this file."""
+    import inspect
+    import sys
+
+    all_classes = inspect.getmembers(sys.modules[__name__], inspect.isclass)
+    for class_type in all_classes:
+        # check its name
+        name = class_type[0]
+        is_io_struct = (
+            name.endswith("Req") or name.endswith("Input") or name.endswith("Output")
+        )
+        is_base_req = issubclass(class_type[1], BaseReq) or issubclass(
+            class_type[1], BaseBatchReq
+        )
+        if is_io_struct and not is_base_req:
+            raise ValueError(f"{name} is not a subclass of BaseReq or BaseBatchReq.")
+        if is_base_req and not is_io_struct:
+            raise ValueError(
+                f"{name} is a subclass of BaseReq but not follow the naming convention."
+            )
+
+
+_check_all_req_types()
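
Note on the io_struct.py change above: the per-class `rid`/`rids` fields and `regenerate_rid()` helpers are consolidated into the new `BaseReq`/`BaseBatchReq` dataclass bases, whose ID fields are declared with `kw_only=True`. The sketch below is a minimal, standalone illustration of why that keyword-only default matters for dataclass inheritance; `ExampleTokenizedReq` is a hypothetical struct used only for demonstration and is not part of sglang.

```python
# Minimal sketch of the BaseReq pattern from this diff (standalone, illustrative).
import uuid
from abc import ABC
from dataclasses import dataclass, field
from typing import List, Optional, Union


@dataclass
class BaseReq(ABC):
    # kw_only keeps this defaulted field out of the positional-argument order,
    # so subclasses may still declare required positional fields of their own.
    rid: Optional[Union[str, List[str]]] = field(default=None, kw_only=True)

    def regenerate_rid(self):
        """Assign a fresh request ID (one per entry when rid is a list)."""
        if isinstance(self.rid, list):
            self.rid = [uuid.uuid4().hex for _ in range(len(self.rid))]
        else:
            self.rid = uuid.uuid4().hex
        return self.rid


@dataclass
class ExampleTokenizedReq(BaseReq):  # hypothetical subclass, for illustration only
    input_text: str      # required positional field: no "non-default follows default" error
    input_ids: List[int]
    stream: bool = False


req = ExampleTokenizedReq("hello", [1, 2, 3], rid="req-1")
req.regenerate_rid()     # replaces "req-1" with a uuid4 hex string
print(req.rid)
```

Because the inherited `rid` has a keyword-only default, each request and output struct in the diff can drop its own ID bookkeeping while still declaring required positional fields, which is what makes a single shared base (and the `_check_all_req_types()` naming check) workable. The `kw_only` field option requires Python 3.10 or newer.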