sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +302 -414
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +13 -8
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +144 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +773 -334
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +225 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +68 -37
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +102 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +56 -31
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +280 -81
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +135 -60
  181. sglang/srt/speculative/build_eagle_tree.py +8 -9
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
  183. sglang/srt/speculative/eagle_utils.py +92 -57
  184. sglang/srt/speculative/eagle_worker.py +238 -111
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/detokenizer_manager.py

@@ -14,6 +14,7 @@
 """DetokenizerManager is a process that detokenizes the token ids."""
 
 import dataclasses
+import json
 import logging
 import os
 import signal
@@ -27,12 +28,21 @@ import zmq
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
+    BatchMultimodalDecodeReq,
     BatchStrOut,
     BatchTokenIDOut,
 )
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, get_zmq_socket
-from sglang.utils import find_printable_text, get_exception_traceback
+from sglang.srt.utils import (
+    configure_logger,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+)
+from sglang.utils import (
+    TypeBasedDispatcher,
+    find_printable_text,
+    get_exception_traceback,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -47,7 +57,6 @@ DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 <
 class DecodeStatus:
     """Store the status of incremental decoding."""
 
-    vid: int
     decoded_text: str
     decode_ids: List[int]
     surr_offset: int
@@ -82,6 +91,22 @@ class DetokenizerManager:
         )
 
         self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
+        self.is_dummy = server_args.load_format == "dummy"
+
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (BatchEmbeddingOut, self.handle_batch_embedding_out),
+                (BatchTokenIDOut, self.handle_batch_token_id_out),
+                (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+            ]
+        )
+
+    def event_loop(self):
+        """The event loop that handles requests"""
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            self.send_to_tokenizer.send_pyobj(output)
 
     def trim_matched_stop(
         self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
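
The constructor above wires an isinstance-based dispatch table in place of the old if/else chain in event_loop. TypeBasedDispatcher itself lives in sglang/utils.py and is not shown in this diff; a minimal sketch of such a dispatcher (an assumed implementation, not the shipped code):

from typing import Any, Callable, List, Tuple


class TypeBasedDispatcher:
    """Route an object to the first handler registered for its type."""

    def __init__(self, mapping: List[Tuple[type, Callable[[Any], Any]]]):
        self._mapping = mapping

    def __call__(self, obj: Any) -> Any:
        for ty, fn in self._mapping:
            if isinstance(obj, ty):
                return fn(obj)
        raise ValueError(f"Invalid object: {obj}")

Table-driven dispatch keeps event_loop a three-line pump and lets a new message type (here BatchMultimodalDecodeReq) register a handler without touching the loop.
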
@@ -106,114 +131,110 @@ class DetokenizerManager:
             return output[:-1]
         return output
 
-    def event_loop(self):
-        """The event loop that handles requests"""
-
-        while True:
-            recv_obj = self.recv_from_scheduler.recv_pyobj()
-
-            if isinstance(recv_obj, BatchEmbeddingOut):
-                # If it is embedding model, no detokenization is needed.
-                self.send_to_tokenizer.send_pyobj(recv_obj)
-                continue
+    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
+        # If it is embedding model, no detokenization is needed.
+        return recv_obj
+
+    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
+        bs = len(recv_obj.rids)
+
+        # Initialize decode status
+        read_ids, surr_ids = [], []
+        for i in range(bs):
+            rid = recv_obj.rids[i]
+            if rid not in self.decode_status:
+                s = DecodeStatus(
+                    decoded_text=recv_obj.decoded_texts[i],
+                    decode_ids=recv_obj.decode_ids[i],
+                    surr_offset=0,
+                    read_offset=recv_obj.read_offsets[i],
+                )
+                self.decode_status[rid] = s
             else:
-                assert isinstance(recv_obj, BatchTokenIDOut)
-
-                bs = len(recv_obj.rids)
-
-                # Initialize decode status
-                read_ids, surr_ids = [], []
-                for i in range(bs):
-                    rid = recv_obj.rids[i]
-                    vid = recv_obj.vids[i]
-                    if rid not in self.decode_status or self.decode_status[rid].vid != vid:
-                        s = DecodeStatus(
-                            vid=vid,
-                            decoded_text=recv_obj.decoded_texts[i],
-                            decode_ids=recv_obj.decode_ids[i],
-                            surr_offset=0,
-                            read_offset=recv_obj.read_offsets[i],
-                        )
-                        self.decode_status[rid] = s
-                    else:
-                        s = self.decode_status[rid]
-                        s.decode_ids = recv_obj.decode_ids[i]
-
-                    read_ids.append(
-                        self.trim_matched_stop(
-                            s.decode_ids[s.surr_offset :],
-                            recv_obj.finished_reasons[i],
-                            recv_obj.no_stop_trim[i],
-                        )
+                s = self.decode_status[rid]
+                s.decode_ids = recv_obj.decode_ids[i]
+
+            read_ids.append(
+                self.trim_matched_stop(
+                    s.decode_ids[s.surr_offset :],
+                    recv_obj.finished_reasons[i],
+                    recv_obj.no_stop_trim[i],
                 )
-                    surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
-
-                # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
-                surr_texts = self.tokenizer.batch_decode(
-                    surr_ids,
-                    skip_special_tokens=recv_obj.skip_special_tokens[0],
-                    spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
-                )
-                read_texts = self.tokenizer.batch_decode(
-                    read_ids,
-                    skip_special_tokens=recv_obj.skip_special_tokens[0],
-                    spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
             )
+            surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
+
+        # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
+        surr_texts = self.tokenizer.batch_decode(
+            surr_ids,
+            skip_special_tokens=recv_obj.skip_special_tokens[0],
+            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+        )
+        read_texts = self.tokenizer.batch_decode(
+            read_ids,
+            skip_special_tokens=recv_obj.skip_special_tokens[0],
+            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+        )
 
-                # Incremental decoding
-                output_strs = []
-                for i in range(bs):
-                    try:
-                        s = self.decode_status[recv_obj.rids[i]]
-                    except KeyError:
-                        raise RuntimeError(
-                            f"Decode status not found for request {recv_obj.rids[i]}. "
-                            "It may be due to the request being evicted from the decode status due to memory pressure. "
-                            "Please increase the maximum number of requests by setting "
-                            "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
-                            f"The current value is {DETOKENIZER_MAX_STATES}. "
-                            "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
-                        )
-                    new_text = read_texts[i][len(surr_texts[i]) :]
-                    if recv_obj.finished_reasons[i] is None:
-                        # Streaming chunk: update the decode status
-                        if len(new_text) > 0 and not new_text.endswith("�"):
-                            s.decoded_text = s.decoded_text + new_text
-                            s.surr_offset = s.read_offset
-                            s.read_offset = len(s.decode_ids)
-                            new_text = ""
-                        else:
-                            new_text = find_printable_text(new_text)
-
-                    output_strs.append(
-                        self.trim_matched_stop(
-                            s.decoded_text + new_text,
-                            recv_obj.finished_reasons[i],
-                            recv_obj.no_stop_trim[i],
-                        )
+        # Incremental decoding
+        output_strs = []
+        for i in range(bs):
+            try:
+                s = self.decode_status[recv_obj.rids[i]]
+            except KeyError:
+                raise RuntimeError(
+                    f"Decode status not found for request {recv_obj.rids[i]}. "
+                    "It may be due to the request being evicted from the decode status due to memory pressure. "
+                    "Please increase the maximum number of requests by setting "
+                    "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
+                    f"The current value is {DETOKENIZER_MAX_STATES}. "
+                    "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
                 )
+            new_text = read_texts[i][len(surr_texts[i]) :]
+            if recv_obj.finished_reasons[i] is None:
+                # Streaming chunk: update the decode status
+                if len(new_text) > 0 and not new_text.endswith("�"):
+                    s.decoded_text = s.decoded_text + new_text
+                    s.surr_offset = s.read_offset
+                    s.read_offset = len(s.decode_ids)
+                    new_text = ""
+                else:
+                    new_text = find_printable_text(new_text)
 
-            self.send_to_tokenizer.send_pyobj(
-                BatchStrOut(
-                    rids=recv_obj.rids,
-                    finished_reasons=recv_obj.finished_reasons,
-                    output_strs=output_strs,
-                    prompt_tokens=recv_obj.prompt_tokens,
-                    completion_tokens=recv_obj.completion_tokens,
-                    cached_tokens=recv_obj.cached_tokens,
-                    spec_verify_ct=recv_obj.spec_verify_ct,
-                    input_token_logprobs_val=recv_obj.input_token_logprobs_val,
-                    input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
-                    output_token_logprobs_val=recv_obj.output_token_logprobs_val,
-                    output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
-                    input_top_logprobs_val=recv_obj.input_top_logprobs_val,
-                    input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
-                    output_top_logprobs_val=recv_obj.output_top_logprobs_val,
-                    output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
-                    output_hidden_states=recv_obj.output_hidden_states,
+            output_strs.append(
+                self.trim_matched_stop(
+                    s.decoded_text + new_text,
+                    recv_obj.finished_reasons[i],
+                    recv_obj.no_stop_trim[i],
                 )
             )
 
+        return BatchStrOut(
+            rids=recv_obj.rids,
+            finished_reasons=recv_obj.finished_reasons,
+            output_strs=output_strs,
+            output_ids=None,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+            spec_verify_ct=recv_obj.spec_verify_ct,
+            input_token_logprobs_val=recv_obj.input_token_logprobs_val,
+            input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
+            output_token_logprobs_val=recv_obj.output_token_logprobs_val,
+            output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
+            input_top_logprobs_val=recv_obj.input_top_logprobs_val,
+            input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
+            output_top_logprobs_val=recv_obj.output_top_logprobs_val,
+            output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
+            input_token_ids_logprobs_val=recv_obj.input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
+            output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
+            output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
+            output_hidden_states=recv_obj.output_hidden_states,
+        )
+
+    def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
+        raise NotImplementedError()
+
 
 class LimitedCapacityDict(OrderedDict):
     def __init__(self, capacity: int, *args, **kwargs):
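
The rewritten handle_batch_token_id_out keeps the same incremental-decoding scheme as the old event_loop: each request tracks a surrogate offset and a read offset into its token ids, and text is emitted only once it no longer ends in "\ufffd" (the replacement character HuggingFace tokenizers produce for an incomplete UTF-8 sequence). A standalone sketch of the idea (illustration only, not the library API):

def emit_increment(tokenizer, decode_ids, surr_offset, read_offset):
    """Return (new_text, new_surr_offset, new_read_offset) for one step."""
    # Decode the short surrogate window and the full read window; the freshly
    # generated text is the suffix of the longer decode.
    surr_text = tokenizer.decode(decode_ids[surr_offset:read_offset])
    read_text = tokenizer.decode(decode_ids[surr_offset:])
    new_text = read_text[len(surr_text):]
    if new_text and not new_text.endswith("\ufffd"):
        # A complete character boundary: emit and slide both offsets forward.
        return new_text, read_offset, len(decode_ids)
    # Incomplete multi-byte character: hold the text back for the next step.
    return "", surr_offset, read_offset
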
@@ -232,6 +253,7 @@ def run_detokenizer_process(
     server_args: ServerArgs,
     port_args: PortArgs,
 ):
+    kill_itself_when_parent_died()
     setproctitle.setproctitle("sglang::detokenizer")
     configure_logger(server_args)
     parent_process = psutil.Process().parent()
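
kill_itself_when_parent_died is imported above but not defined in this diff. A plausible implementation, assuming the standard Linux parent-death-signal mechanism (a sketch, not necessarily what sglang.srt.utils ships):

import ctypes
import signal
import sys


def kill_itself_when_parent_died():
    # Linux-only: ask the kernel to deliver SIGKILL to this process as soon
    # as its parent exits, so orphaned subprocesses cannot linger.
    if sys.platform == "linux":
        PR_SET_PDEATHSIG = 1  # from <sys/prctl.h>
        libc = ctypes.CDLL("libc.so.6")
        libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
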
sglang/srt/managers/image_processor.py

@@ -544,7 +544,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             image_hashes = [image_hash]
             image_sizes = [image_size]
             image_grid_thws = [image_grid_thw]
-        elif isinstance(image_data, str):
+        elif isinstance(image_data, str) or isinstance(image_data, bytes):
             # A single image
             pixel_values, image_hash, image_size, image_grid_thw = (
                 await self._process_single_image(image_data)
@@ -553,6 +553,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             image_sizes = [image_size]
             image_grid_thws = [image_grid_thw]
         else:
+
             raise ValueError(f"Invalid image data: {image_data}")
 
         return {
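
With the widened isinstance check, a single image may now arrive as raw bytes rather than only a str (path, URL, or base64). Since raw bytes are not JSON-serializable, the natural caller is the in-process engine; a sketch (model path and file name are placeholders, and the exact return shape is an assumption):

import sglang as sgl

llm = sgl.Engine(model_path="Qwen/Qwen2-VL-7B-Instruct")  # hypothetical choice
with open("cat.png", "rb") as f:
    out = llm.generate(
        prompt="Describe this image.",
        image_data=f.read(),  # previously this had to be a str
    )
print(out["text"])
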
sglang/srt/managers/io_struct.py

@@ -16,10 +16,11 @@ The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
 
+import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -55,6 +56,8 @@ class GenerateReqInput:
     logprob_start_len: Optional[Union[List[int], int]] = None
     # If return logprobs, the number of top logprobs to return at each position.
     top_logprobs_num: Optional[Union[List[int], int]] = None
+    # If return logprobs, the token ids to return logprob for.
+    token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None
     # Whether to detokenize tokens in text in the returned logprobs.
     return_text_in_logprobs: bool = False
     # Whether to stream output.
@@ -69,11 +72,15 @@ class GenerateReqInput:
 
     # Session info for continual prompting
     session_params: Optional[Union[List[Dict], Dict]] = None
+
     # Custom logit processor for advanced sampling control. Must be a serialized instance
     # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
     # Use the processor's `to_str()` method to generate the serialized string.
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
 
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
     def normalize_batch_and_arguments(self):
         if (
             self.text is None and self.input_ids is None and self.input_embeds is None
@@ -142,6 +149,8 @@
                 self.logprob_start_len = -1
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
+            if not self.token_ids_logprob:  # covers both None and []
+                self.token_ids_logprob = None
         else:
             if self.parallel_sample_num == 1:
                 num = self.batch_size
@@ -149,7 +158,7 @@
                 # Expand parallel_sample_num
                 num = self.batch_size * self.parallel_sample_num
 
-            if self.image_data is None:
+            if not self.image_data:
                 self.image_data = [None] * num
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
@@ -187,6 +196,17 @@
             else:
                 assert self.parallel_sample_num == 1
 
+            if not self.token_ids_logprob:  # covers both None and []
+                self.token_ids_logprob = [None] * num
+            elif not isinstance(self.token_ids_logprob, list):
+                self.token_ids_logprob = [[self.token_ids_logprob] for _ in range(num)]
+            elif not isinstance(self.token_ids_logprob[0], list):
+                self.token_ids_logprob = [
+                    copy.deepcopy(self.token_ids_logprob) for _ in range(num)
+                ]
+            else:
+                assert self.parallel_sample_num == 1
+
             if self.custom_logit_processor is None:
                 self.custom_logit_processor = [None] * num
             elif not isinstance(self.custom_logit_processor, list):
@@ -194,6 +214,12 @@
             else:
                 assert self.parallel_sample_num == 1
 
+        # Other checks
+        if self.session_params is not None:
+            assert isinstance(self.session_params, dict) or isinstance(
+                self.session_params[0], dict
+            )
+
     def regenerate_rid(self):
         self.rid = uuid.uuid4().hex
         return self.rid
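
The normalization added above gives token_ids_logprob the same batch semantics as the other per-request fields: a scalar or a single flat list is replicated across the batch, while a list of lists is taken as per-request. A standalone restatement (illustration only, not the library API), with the behaviour pinned by asserts; the real method additionally asserts parallel_sample_num == 1 in the last branch instead of returning unchanged:

import copy
from typing import List, Optional


def normalize_token_ids_logprob(value, num: int) -> List[Optional[List[int]]]:
    if not value:  # covers both None and []
        return [None] * num
    if not isinstance(value, list):
        return [[value] for _ in range(num)]  # a single scalar token id
    if not isinstance(value[0], list):
        return [copy.deepcopy(value) for _ in range(num)]  # one shared list
    return value  # already one list per request


assert normalize_token_ids_logprob(None, 2) == [None, None]
assert normalize_token_ids_logprob(42, 2) == [[42], [42]]
assert normalize_token_ids_logprob([5, 7], 2) == [[5, 7], [5, 7]]
assert normalize_token_ids_logprob([[5], [7]], 2) == [[5], [7]]
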
@@ -208,6 +234,7 @@
             return_logprob=self.return_logprob[i],
             logprob_start_len=self.logprob_start_len[i],
             top_logprobs_num=self.top_logprobs_num[i],
+            token_ids_logprob=self.token_ids_logprob[i],
             return_text_in_logprobs=self.return_text_in_logprobs,
             stream=self.stream,
             log_metrics=self.log_metrics,
@@ -218,6 +245,7 @@
                 if self.custom_logit_processor is not None
                 else None
             ),
+            return_hidden_states=self.return_hidden_states,
         )
 
 
@@ -239,6 +267,8 @@
     logprob_start_len: int
     # If return logprobs, the number of top logprobs to return at each position.
     top_logprobs_num: int
+    # If return logprobs, the token id to return logprob for
+    token_ids_logprob: List[int]
    # Whether to stream output
     stream: bool
 
@@ -255,6 +285,9 @@
     # Use the processor's `to_str()` method to generate the serialized string.
     custom_logit_processor: Optional[str] = None
 
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
 
 @dataclass
 class EmbeddingReqInput:
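
return_hidden_states threads from GenerateReqInput through TokenizedGenerateReqInput to the output_hidden_states fields below. Since the /generate endpoint maps the JSON body onto GenerateReqInput fields by name, a request could plausibly opt in like this (server URL and response keys are assumptions):

import requests

resp = requests.post(
    "http://localhost:30000/generate",  # assumed local server
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 8},
        "return_hidden_states": True,
    },
)
print(resp.json()["meta_info"].keys())  # inspect where the hidden states land
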
@@ -343,8 +376,6 @@ class BatchTokenIDOut:
     # The finish reason
     finished_reasons: List[BaseFinishReason]
     # For incremental decoding
-    # The version id to sync decode status with in detokenizer_manager
-    vids: List[int]
     decoded_texts: List[str]
     decode_ids: List[int]
     read_offsets: List[int]
@@ -370,10 +401,27 @@
     input_top_logprobs_idx: List[List]
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
 
+    # Hidden states
     output_hidden_states: List[List[float]]
 
 
+@dataclass
+class BatchMultimodalDecodeReq:
+    # The request id
+    rids: List[str]
+    finished_reasons: List[BaseFinishReason]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+
 @dataclass
 class BatchStrOut:
     # The request id
@@ -382,6 +430,8 @@ class BatchStrOut:
     finished_reasons: List[dict]
     # The output decoded strings
     output_strs: List[str]
+    # The token ids
+    output_ids: Optional[List[int]]
 
     # Token counts
     prompt_tokens: List[int]
@@ -398,10 +448,30 @@
     input_top_logprobs_idx: List[List]
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
 
+    # Hidden states
     output_hidden_states: List[List[float]]
 
 
+@dataclass
+class BatchMultimodalOut:
+    # The request id
+    rids: List[str]
+    # The finish reason
+    finished_reasons: List[dict]
+    # The outputs
+    outputs: List[List[Dict]]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+
 @dataclass
 class BatchEmbeddingOut:
     # The request id
@@ -412,6 +482,7 @@ class BatchEmbeddingOut:
     embeddings: List[List[float]]
     # Token counts
     prompt_tokens: List[int]
+    cached_tokens: List[int]
 
 
 @dataclass
@@ -431,6 +502,8 @@ class UpdateWeightFromDiskReqInput:
 class UpdateWeightFromDiskReqOutput:
     success: bool
     message: str
+    # Number of paused requests during weight sync.
+    num_paused_requests: Optional[int] = 0
 
 
 @dataclass
@@ -449,6 +522,8 @@ class UpdateWeightsFromDistributedReqOutput:
 @dataclass
 class UpdateWeightsFromTensorReqInput:
     serialized_named_tensors: bytes  # indeed Dict[str, torch.Tensor]
+    load_format: Optional[str]
+    flush_cache: bool
 
 
 @dataclass
@@ -516,11 +591,57 @@ class AbortReq:
     rid: str
 
 
-class ProfileReq(Enum):
+@dataclass
+class GetInternalStateReq:
+    pass
+
+
+@dataclass
+class GetInternalStateReqOutput:
+    internal_state: Dict[Any, Any]
+
+
+@dataclass
+class SetInternalStateReq:
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class SetInternalStateReqOutput:
+    updated: bool
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class ProfileReqInput:
+    # The output directory
+    output_dir: Optional[str] = None
+    # If set, it profile as many as this number of steps.
+    # If it is set, profiling is automatically stopped after this step, and
+    # the caller doesn't need to run stop_profile.
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+
+
+class ProfileReqType(Enum):
     START_PROFILE = 1
     STOP_PROFILE = 2
 
 
+@dataclass
+class ProfileReq:
+    type: ProfileReqType
+    output_dir: Optional[str] = None
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+
+
+@dataclass
+class ProfileReqOutput:
+    success: bool
+    message: str
+
+
 @dataclass
 class ConfigureLoggingReq:
     log_requests: Optional[bool] = None
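
ProfileReq changes from a bare Enum to a dataclass carrying the profiling options, with the old enum values preserved in ProfileReqType, so a start request carries its configuration in one message. A hypothetical construction (the directory and activity labels are assumptions):

from sglang.srt.managers.io_struct import ProfileReq, ProfileReqType

req = ProfileReq(
    type=ProfileReqType.START_PROFILE,
    output_dir="/tmp/sglang_trace",
    num_steps=10,  # auto-stops after 10 steps; no stop_profile call needed
    activities=["CPU", "GPU"],  # assumed activity labels
)
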
@@ -546,6 +667,11 @@ class OpenSessionReqOutput:
     success: bool
 
 
+@dataclass
+class HealthCheckOutput:
+    pass
+
+
 @dataclass
 class Function:
     description: Optional[str] = None
@@ -560,7 +686,7 @@ class Tool:
 
 
 @dataclass
-class FunctionCallReqInput:
+class ParseFunctionCallReq:
     text: str  # The text to parse.
     tools: List[Tool] = field(
         default_factory=list
@@ -568,3 +694,15 @@ class FunctionCallReqInput:
     tool_call_parser: Optional[str] = (
        None  # Specify the parser type, e.g. 'llama3', 'qwen25', or 'mistral'. If not specified, tries all.
     )
+
+
+@dataclass
+class SeparateReasoningReqInput:
+    text: str  # The text to parse.
+    reasoning_parser: str  # Specify the parser type, e.g., "deepseek-r1".
+
+
+@dataclass
+class VertexGenerateReqInput:
+    instances: List[dict]
+    parameters: Optional[dict] = None
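
VertexGenerateReqInput mirrors the request envelope of Google Vertex AI custom prediction, where the body is split into instances and parameters. A plausible payload (the field contents are illustrative, not prescribed by this diff):

payload = {
    "instances": [
        {"text": "What is the capital of France?"},
        {"text": "Write a haiku about GPUs."},
    ],
    "parameters": {"sampling_params": {"max_new_tokens": 32}},
}
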