sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +220 -378
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +208 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -14,6 +14,7 @@
 """DetokenizerManager is a process that detokenizes the token ids."""
 
 import dataclasses
+import json
 import logging
 import os
 import signal
@@ -27,12 +28,21 @@ import zmq
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
+    BatchMultimodalDecodeReq,
     BatchStrOut,
     BatchTokenIDOut,
 )
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, get_zmq_socket
-from sglang.utils import find_printable_text, get_exception_traceback
+from sglang.srt.utils import (
+    configure_logger,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+)
+from sglang.utils import (
+    TypeBasedDispatcher,
+    find_printable_text,
+    get_exception_traceback,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -47,7 +57,6 @@ DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 <
 class DecodeStatus:
     """Store the status of incremental decoding."""
 
-    vid: int
     decoded_text: str
     decode_ids: List[int]
     surr_offset: int
@@ -82,6 +91,22 @@ class DetokenizerManager:
         )
 
         self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
+        self.is_dummy = server_args.load_format == "dummy"
+
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (BatchEmbeddingOut, self.handle_batch_embedding_out),
+                (BatchTokenIDOut, self.handle_batch_token_id_out),
+                (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+            ]
+        )
+
+    def event_loop(self):
+        """The event loop that handles requests"""
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            self.send_to_tokenizer.send_pyobj(output)
 
     def trim_matched_stop(
         self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
@@ -106,114 +131,110 @@ class DetokenizerManager:
             return output[:-1]
         return output
 
-    def event_loop(self):
-        """The event loop that handles requests"""
-
-        while True:
-            recv_obj = self.recv_from_scheduler.recv_pyobj()
-
-            if isinstance(recv_obj, BatchEmbeddingOut):
-                # If it is embedding model, no detokenization is needed.
-                self.send_to_tokenizer.send_pyobj(recv_obj)
-                continue
+    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
+        # If it is embedding model, no detokenization is needed.
+        return recv_obj
+
+    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
+        bs = len(recv_obj.rids)
+
+        # Initialize decode status
+        read_ids, surr_ids = [], []
+        for i in range(bs):
+            rid = recv_obj.rids[i]
+            if rid not in self.decode_status:
+                s = DecodeStatus(
+                    decoded_text=recv_obj.decoded_texts[i],
+                    decode_ids=recv_obj.decode_ids[i],
+                    surr_offset=0,
+                    read_offset=recv_obj.read_offsets[i],
+                )
+                self.decode_status[rid] = s
             else:
-                assert isinstance(recv_obj, BatchTokenIDOut)
-
-            bs = len(recv_obj.rids)
-
-            # Initialize decode status
-            read_ids, surr_ids = [], []
-            for i in range(bs):
-                rid = recv_obj.rids[i]
-                vid = recv_obj.vids[i]
-                if rid not in self.decode_status or self.decode_status[rid].vid != vid:
-                    s = DecodeStatus(
-                        vid=vid,
-                        decoded_text=recv_obj.decoded_texts[i],
-                        decode_ids=recv_obj.decode_ids[i],
-                        surr_offset=0,
-                        read_offset=recv_obj.read_offsets[i],
-                    )
-                    self.decode_status[rid] = s
-                else:
-                    s = self.decode_status[rid]
-                    s.decode_ids = recv_obj.decode_ids[i]
-
-                read_ids.append(
-                    self.trim_matched_stop(
-                        s.decode_ids[s.surr_offset :],
-                        recv_obj.finished_reasons[i],
-                        recv_obj.no_stop_trim[i],
-                    )
+                s = self.decode_status[rid]
+                s.decode_ids = recv_obj.decode_ids[i]
+
+            read_ids.append(
+                self.trim_matched_stop(
+                    s.decode_ids[s.surr_offset :],
+                    recv_obj.finished_reasons[i],
+                    recv_obj.no_stop_trim[i],
                 )
-                surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
-
-            # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
-            surr_texts = self.tokenizer.batch_decode(
-                surr_ids,
-                skip_special_tokens=recv_obj.skip_special_tokens[0],
-                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
-            )
-            read_texts = self.tokenizer.batch_decode(
-                read_ids,
-                skip_special_tokens=recv_obj.skip_special_tokens[0],
-                spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
             )
+            surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
+
+        # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
+        surr_texts = self.tokenizer.batch_decode(
+            surr_ids,
+            skip_special_tokens=recv_obj.skip_special_tokens[0],
+            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+        )
+        read_texts = self.tokenizer.batch_decode(
+            read_ids,
+            skip_special_tokens=recv_obj.skip_special_tokens[0],
+            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+        )
 
-            # Incremental decoding
-            output_strs = []
-            for i in range(bs):
-                try:
-                    s = self.decode_status[recv_obj.rids[i]]
-                except KeyError:
-                    raise RuntimeError(
-                        f"Decode status not found for request {recv_obj.rids[i]}. "
-                        "It may be due to the request being evicted from the decode status due to memory pressure. "
-                        "Please increase the maximum number of requests by setting "
-                        "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
-                        f"The current value is {DETOKENIZER_MAX_STATES}. "
-                        "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
-                    )
-                new_text = read_texts[i][len(surr_texts[i]) :]
-                if recv_obj.finished_reasons[i] is None:
-                    # Streaming chunk: update the decode status
-                    if len(new_text) > 0 and not new_text.endswith("�"):
-                        s.decoded_text = s.decoded_text + new_text
-                        s.surr_offset = s.read_offset
-                        s.read_offset = len(s.decode_ids)
-                        new_text = ""
-                    else:
-                        new_text = find_printable_text(new_text)
-
-                output_strs.append(
-                    self.trim_matched_stop(
-                        s.decoded_text + new_text,
-                        recv_obj.finished_reasons[i],
-                        recv_obj.no_stop_trim[i],
-                    )
+        # Incremental decoding
+        output_strs = []
+        for i in range(bs):
+            try:
+                s = self.decode_status[recv_obj.rids[i]]
+            except KeyError:
+                raise RuntimeError(
+                    f"Decode status not found for request {recv_obj.rids[i]}. "
+                    "It may be due to the request being evicted from the decode status due to memory pressure. "
+                    "Please increase the maximum number of requests by setting "
+                    "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
+                    f"The current value is {DETOKENIZER_MAX_STATES}. "
+                    "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
                 )
+            new_text = read_texts[i][len(surr_texts[i]) :]
+            if recv_obj.finished_reasons[i] is None:
+                # Streaming chunk: update the decode status
+                if len(new_text) > 0 and not new_text.endswith("�"):
+                    s.decoded_text = s.decoded_text + new_text
+                    s.surr_offset = s.read_offset
+                    s.read_offset = len(s.decode_ids)
+                    new_text = ""
+                else:
+                    new_text = find_printable_text(new_text)
 
-            self.send_to_tokenizer.send_pyobj(
-                BatchStrOut(
-                    rids=recv_obj.rids,
-                    finished_reasons=recv_obj.finished_reasons,
-                    output_strs=output_strs,
-                    prompt_tokens=recv_obj.prompt_tokens,
-                    completion_tokens=recv_obj.completion_tokens,
-                    cached_tokens=recv_obj.cached_tokens,
-                    spec_verify_ct=recv_obj.spec_verify_ct,
-                    input_token_logprobs_val=recv_obj.input_token_logprobs_val,
-                    input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
-                    output_token_logprobs_val=recv_obj.output_token_logprobs_val,
-                    output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
-                    input_top_logprobs_val=recv_obj.input_top_logprobs_val,
-                    input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
-                    output_top_logprobs_val=recv_obj.output_top_logprobs_val,
-                    output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
-                    output_hidden_states=recv_obj.output_hidden_states,
+            output_strs.append(
+                self.trim_matched_stop(
+                    s.decoded_text + new_text,
+                    recv_obj.finished_reasons[i],
+                    recv_obj.no_stop_trim[i],
                 )
             )
 
+        return BatchStrOut(
+            rids=recv_obj.rids,
+            finished_reasons=recv_obj.finished_reasons,
+            output_strs=output_strs,
+            output_ids=None,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+            spec_verify_ct=recv_obj.spec_verify_ct,
+            input_token_logprobs_val=recv_obj.input_token_logprobs_val,
+            input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
+            output_token_logprobs_val=recv_obj.output_token_logprobs_val,
+            output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
+            input_top_logprobs_val=recv_obj.input_top_logprobs_val,
+            input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
+            output_top_logprobs_val=recv_obj.output_top_logprobs_val,
+            output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
+            input_token_ids_logprobs_val=recv_obj.input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
+            output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
+            output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
+            output_hidden_states=recv_obj.output_hidden_states,
+        )
+
+    def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
+        raise NotImplementedError()
+
 
 class LimitedCapacityDict(OrderedDict):
     def __init__(self, capacity: int, *args, **kwargs):
@@ -232,6 +253,7 @@ def run_detokenizer_process(
     server_args: ServerArgs,
     port_args: PortArgs,
 ):
+    kill_itself_when_parent_died()
    setproctitle.setproctitle("sglang::detokenizer")
    configure_logger(server_args)
    parent_process = psutil.Process().parent()
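The detokenizer rewrite above replaces the old `isinstance` chain in `event_loop` with a `TypeBasedDispatcher` that maps each incoming message type to a handler and forwards the handler's return value to the tokenizer. A minimal sketch of that dispatch pattern (the real implementation lives in `sglang.utils` and may differ in detail):

from typing import Any, Callable, List, Tuple, Type


class TypeBasedDispatcher:
    """Route an object to the first handler registered for its type.

    Sketch of the pattern used by DetokenizerManager above; not the
    exact sglang.utils implementation.
    """

    def __init__(self, mapping: List[Tuple[Type, Callable]]):
        self._mapping = mapping

    def __call__(self, obj: Any) -> Any:
        for ty, fn in self._mapping:
            if isinstance(obj, ty):
                return fn(obj)
        raise ValueError(f"Invalid object: {obj!r}")


# Usage mirroring the constructor wiring above:
dispatcher = TypeBasedDispatcher([(int, lambda x: x + 1), (str, str.upper)])
assert dispatcher(41) == 42
assert dispatcher("ok") == "OK"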
sglang/srt/managers/image_processor.py
CHANGED
@@ -544,7 +544,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             image_hashes = [image_hash]
             image_sizes = [image_size]
             image_grid_thws = [image_grid_thw]
-        elif isinstance(image_data, str):
+        elif isinstance(image_data, str) or isinstance(image_data, bytes):
             # A single image
             pixel_values, image_hash, image_size, image_grid_thw = (
                 await self._process_single_image(image_data)
@@ -553,6 +553,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             image_sizes = [image_size]
             image_grid_thws = [image_grid_thw]
         else:
+
             raise ValueError(f"Invalid image data: {image_data}")
 
         return {
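With the widened check above, a single image may now arrive as raw `bytes` in addition to a `str` (path, URL, or base64 data). A small illustration with hypothetical inputs:

# Hypothetical inputs illustrating the widened single-image check.
image_as_str = "https://example.com/cat.png"  # URL, file path, or base64 string
image_as_bytes = b"\x89PNG\r\n\x1a\n..."  # raw encoded image bytes

for image_data in (image_as_str, image_as_bytes):
    if isinstance(image_data, str) or isinstance(image_data, bytes):
        print(type(image_data).__name__, "-> handled as a single image")
    else:
        raise ValueError(f"Invalid image data: {image_data}")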
sglang/srt/managers/io_struct.py
CHANGED
@@ -16,10 +16,11 @@ The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
 
+import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -55,6 +56,8 @@ class GenerateReqInput:
     logprob_start_len: Optional[Union[List[int], int]] = None
     # If return logprobs, the number of top logprobs to return at each position.
     top_logprobs_num: Optional[Union[List[int], int]] = None
+    # If return logprobs, the token ids to return logprob for.
+    token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None
     # Whether to detokenize tokens in text in the returned logprobs.
     return_text_in_logprobs: bool = False
     # Whether to stream output.
@@ -69,11 +72,15 @@ class GenerateReqInput:
 
     # Session info for continual prompting
     session_params: Optional[Union[List[Dict], Dict]] = None
+
     # Custom logit processor for advanced sampling control. Must be a serialized instance
     # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
     # Use the processor's `to_str()` method to generate the serialized string.
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
 
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
     def normalize_batch_and_arguments(self):
         if (
             self.text is None and self.input_ids is None and self.input_embeds is None
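Putting the two new `GenerateReqInput` fields together, a request that asks for per-position logprobs of specific token ids plus the hidden states might be built like this (a sketch; the token ids and sampling parameters are illustrative only):

from sglang.srt.managers.io_struct import GenerateReqInput

req = GenerateReqInput(
    text="The capital of France is",
    sampling_params={"max_new_tokens": 8, "temperature": 0.0},
    return_logprob=True,
    token_ids_logprob=[3681, 7042],  # hypothetical token ids to track
    return_hidden_states=True,
)
# Fills defaults and expands list-vs-scalar fields in place.
req.normalize_batch_and_arguments()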
@@ -142,6 +149,8 @@
                 self.logprob_start_len = -1
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
+            if not self.token_ids_logprob:  # covers both None and []
+                self.token_ids_logprob = None
         else:
             if self.parallel_sample_num == 1:
                 num = self.batch_size
@@ -149,7 +158,7 @@
                 # Expand parallel_sample_num
                 num = self.batch_size * self.parallel_sample_num
 
-        if self.image_data is None:
+        if not self.image_data:
             self.image_data = [None] * num
         elif not isinstance(self.image_data, list):
             self.image_data = [self.image_data] * num
@@ -187,6 +196,17 @@
         else:
             assert self.parallel_sample_num == 1
 
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = [None] * num
+        elif not isinstance(self.token_ids_logprob, list):
+            self.token_ids_logprob = [[self.token_ids_logprob] for _ in range(num)]
+        elif not isinstance(self.token_ids_logprob[0], list):
+            self.token_ids_logprob = [
+                copy.deepcopy(self.token_ids_logprob) for _ in range(num)
+            ]
+        else:
+            assert self.parallel_sample_num == 1
+
         if self.custom_logit_processor is None:
             self.custom_logit_processor = [None] * num
         elif not isinstance(self.custom_logit_processor, list):
@@ -194,6 +214,12 @@
         else:
             assert self.parallel_sample_num == 1
 
+        # Other checks
+        if self.session_params is not None:
+            assert isinstance(self.session_params, dict) or isinstance(
+                self.session_params[0], dict
+            )
+
     def regenerate_rid(self):
         self.rid = uuid.uuid4().hex
         return self.rid
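The batch branch above deep-copies a flat `token_ids_logprob` list once per request, so each request ends up with an independent list rather than `num` aliases of the same one. A standalone sketch of that expansion rule (simplified; not the sglang API itself):

import copy
from typing import List, Optional


def expand_token_ids_logprob(
    value: Optional[List], num: int
) -> List[Optional[List[int]]]:
    if not value:  # covers both None and []
        return [None] * num
    if not isinstance(value[0], list):
        # One flat list shared by the whole batch: deep-copy per request
        # so later per-request mutation cannot leak across the batch.
        return [copy.deepcopy(value) for _ in range(num)]
    return value  # already one list per request


print(expand_token_ids_logprob([3681, 7042], 2))
# [[3681, 7042], [3681, 7042]] -- two independent lists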
@@ -208,6 +234,7 @@
             return_logprob=self.return_logprob[i],
             logprob_start_len=self.logprob_start_len[i],
             top_logprobs_num=self.top_logprobs_num[i],
+            token_ids_logprob=self.token_ids_logprob[i],
             return_text_in_logprobs=self.return_text_in_logprobs,
             stream=self.stream,
             log_metrics=self.log_metrics,
@@ -218,6 +245,7 @@
                 if self.custom_logit_processor is not None
                 else None
             ),
+            return_hidden_states=self.return_hidden_states,
         )
 
 
@@ -239,6 +267,8 @@ class TokenizedGenerateReqInput:
     logprob_start_len: int
     # If return logprobs, the number of top logprobs to return at each position.
     top_logprobs_num: int
+    # If return logprobs, the token id to return logprob for
+    token_ids_logprob: List[int]
     # Whether to stream output
     stream: bool
 
@@ -255,6 +285,9 @@ class TokenizedGenerateReqInput:
     # Use the processor's `to_str()` method to generate the serialized string.
     custom_logit_processor: Optional[str] = None
 
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
 
 @dataclass
 class EmbeddingReqInput:
@@ -343,8 +376,6 @@ class BatchTokenIDOut:
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # For incremental decoding
-    # The version id to sync decode status with in detokenizer_manager
-    vids: List[int]
     decoded_texts: List[str]
     decode_ids: List[int]
     read_offsets: List[int]
@@ -370,10 +401,27 @@
     input_top_logprobs_idx: List[List]
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
 
+    # Hidden states
     output_hidden_states: List[List[float]]
 
 
+@dataclass
+class BatchMultimodalDecodeReq:
+    # The request id
+    rids: List[str]
+    finished_reasons: List[BaseFinishReason]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+
 @dataclass
 class BatchStrOut:
     # The request id
@@ -382,6 +430,8 @@ class BatchStrOut:
     finished_reasons: List[dict]
     # The output decoded strings
     output_strs: List[str]
+    # The token ids
+    output_ids: Optional[List[int]]
 
     # Token counts
     prompt_tokens: List[int]
@@ -398,10 +448,30 @@
     input_top_logprobs_idx: List[List]
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
 
+    # Hidden states
     output_hidden_states: List[List[float]]
 
 
+@dataclass
+class BatchMultimodalOut:
+    # The request id
+    rids: List[str]
+    # The finish reason
+    finished_reasons: List[dict]
+    # The outputs
+    outputs: List[List[Dict]]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+
 @dataclass
 class BatchEmbeddingOut:
     # The request id
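The new `*_token_ids_logprobs_val` / `*_token_ids_logprobs_idx` fields follow the same convention as the existing logprob fields: parallel lists, with `idx` carrying token ids and `val` carrying their log-probabilities. A hypothetical consumer-side helper that zips one such pair back into dicts:

from typing import Dict, List


def zip_token_ids_logprobs(
    vals: List[List[float]], idxs: List[List[int]]
) -> List[Dict[int, float]]:
    # One {token_id: logprob} dict per inner val/idx pair.
    return [dict(zip(ids, vs)) for vs, ids in zip(vals, idxs)]


print(zip_token_ids_logprobs([[-0.1, -2.3]], [[3681, 7042]]))
# [{3681: -0.1, 7042: -2.3}]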
@@ -431,6 +501,8 @@ class UpdateWeightFromDiskReqInput:
 class UpdateWeightFromDiskReqOutput:
     success: bool
     message: str
+    # Number of paused requests during weight sync.
+    num_paused_requests: Optional[int] = 0
 
 
 @dataclass
@@ -449,6 +521,8 @@ class UpdateWeightsFromDistributedReqOutput:
 @dataclass
 class UpdateWeightsFromTensorReqInput:
     serialized_named_tensors: bytes  # indeed Dict[str, torch.Tensor]
+    load_format: Optional[str]
+    flush_cache: bool
 
 
 @dataclass
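`UpdateWeightsFromTensorReqInput.serialized_named_tensors` is opaque `bytes` that, per the inline comment, really hold a `Dict[str, torch.Tensor]`; the request now also carries a `load_format` and a `flush_cache` flag. One way such bytes could be produced, assuming plain `torch.save` serialization (the diff does not show which serializer sglang actually uses):

import io

import torch

from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput

# Assumption: torch.save of a name -> tensor dict; sglang's real
# serializer may be different.
buf = io.BytesIO()
torch.save({"lm_head.weight": torch.zeros(4, 4)}, buf)

req = UpdateWeightsFromTensorReqInput(
    serialized_named_tensors=buf.getvalue(),
    load_format=None,  # keep the server's default weight-loading path
    flush_cache=True,  # drop cached KV entries after the swap
)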
@@ -516,11 +590,57 @@ class AbortReq:
     rid: str
 
 
-class ProfileReq(Enum):
+@dataclass
+class GetInternalStateReq:
+    pass
+
+
+@dataclass
+class GetInternalStateReqOutput:
+    internal_state: Dict[Any, Any]
+
+
+@dataclass
+class SetInternalStateReq:
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class SetInternalStateReqOutput:
+    updated: bool
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class ProfileReqInput:
+    # The output directory
+    output_dir: Optional[str] = None
+    # If set, it profile as many as this number of steps.
+    # If it is set, profiling is automatically stopped after this step, and
+    # the caller doesn't need to run stop_profile.
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+
+
+class ProfileReqType(Enum):
     START_PROFILE = 1
     STOP_PROFILE = 2
 
 
+@dataclass
+class ProfileReq:
+    type: ProfileReqType
+    output_dir: Optional[str] = None
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+
+
+@dataclass
+class ProfileReqOutput:
+    success: bool
+    message: str
+
+
 @dataclass
 class ConfigureLoggingReq:
     log_requests: Optional[bool] = None
@@ -546,6 +666,11 @@ class OpenSessionReqOutput:
     success: bool
 
 
+@dataclass
+class HealthCheckOutput:
+    pass
+
+
 @dataclass
 class Function:
     description: Optional[str] = None
@@ -560,7 +685,7 @@ class Tool:
 
 
 @dataclass
-class FunctionCallReqInput:
+class ParseFunctionCallReq:
     text: str  # The text to parse.
     tools: List[Tool] = field(
         default_factory=list
@@ -568,3 +693,15 @@ class FunctionCallReqInput:
     tool_call_parser: Optional[str] = (
         None  # Specify the parser type, e.g. 'llama3', 'qwen25', or 'mistral'. If not specified, tries all.
     )
+
+
+@dataclass
+class SeparateReasoningReqInput:
+    text: str  # The text to parse.
+    reasoning_parser: str  # Specify the parser type, e.g., "deepseek-r1".
+
+
+@dataclass
+class VertexGenerateReqInput:
+    instances: List[dict]
+    parameters: Optional[dict] = None
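With `ProfileReq` now a dataclass wrapping `ProfileReqType` plus options, a start/stop pair can carry the output directory, a step cap, and the activity list inline (values below are illustrative):

from sglang.srt.managers.io_struct import ProfileReq, ProfileReqType

# Start profiling, capped at 10 steps; with num_steps set, profiling
# stops by itself and no explicit stop request is required.
start = ProfileReq(
    type=ProfileReqType.START_PROFILE,
    output_dir="/tmp/sglang_profile",
    num_steps=10,
    activities=["CPU", "GPU"],  # illustrative activity names
)
stop = ProfileReq(type=ProfileReqType.STOP_PROFILE)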