sglang 0.4.5__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +23 -2
  3. sglang/bench_serving.py +6 -4
  4. sglang/lang/backend/anthropic.py +0 -4
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/openai.py +1 -1
  7. sglang/lang/backend/vertexai.py +0 -1
  8. sglang/lang/compiler.py +1 -7
  9. sglang/lang/tracer.py +3 -7
  10. sglang/srt/_custom_ops.py +0 -2
  11. sglang/srt/configs/model_config.py +37 -5
  12. sglang/srt/constrained/base_grammar_backend.py +26 -5
  13. sglang/srt/constrained/llguidance_backend.py +1 -0
  14. sglang/srt/constrained/outlines_backend.py +1 -0
  15. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  16. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  17. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  18. sglang/srt/constrained/xgrammar_backend.py +27 -4
  19. sglang/srt/custom_op.py +0 -62
  20. sglang/srt/disaggregation/base/__init__.py +8 -0
  21. sglang/srt/disaggregation/base/conn.py +113 -0
  22. sglang/srt/disaggregation/decode.py +80 -11
  23. sglang/srt/disaggregation/mini_lb.py +58 -123
  24. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  25. sglang/srt/disaggregation/mooncake/conn.py +585 -0
  26. sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
  27. sglang/srt/disaggregation/prefill.py +82 -22
  28. sglang/srt/disaggregation/utils.py +46 -0
  29. sglang/srt/entrypoints/EngineBase.py +53 -0
  30. sglang/srt/entrypoints/engine.py +36 -8
  31. sglang/srt/entrypoints/http_server.py +37 -8
  32. sglang/srt/entrypoints/http_server_engine.py +142 -0
  33. sglang/srt/entrypoints/verl_engine.py +42 -13
  34. sglang/srt/hf_transformers_utils.py +4 -0
  35. sglang/srt/layers/activation.py +6 -8
  36. sglang/srt/layers/attention/flashattention_backend.py +430 -257
  37. sglang/srt/layers/attention/flashinfer_backend.py +18 -9
  38. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  39. sglang/srt/layers/attention/triton_backend.py +6 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  41. sglang/srt/layers/attention/vision.py +1 -1
  42. sglang/srt/layers/dp_attention.py +2 -4
  43. sglang/srt/layers/elementwise.py +15 -2
  44. sglang/srt/layers/layernorm.py +1 -1
  45. sglang/srt/layers/linear.py +18 -3
  46. sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  48. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +34 -34
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
  56. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  57. sglang/srt/layers/moe/router.py +7 -1
  58. sglang/srt/layers/moe/topk.py +63 -45
  59. sglang/srt/layers/parameter.py +0 -2
  60. sglang/srt/layers/quantization/__init__.py +13 -5
  61. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  62. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
  64. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  65. sglang/srt/layers/quantization/fp8.py +131 -136
  66. sglang/srt/layers/quantization/fp8_kernel.py +328 -46
  67. sglang/srt/layers/quantization/fp8_utils.py +206 -253
  68. sglang/srt/layers/quantization/kv_cache.py +43 -52
  69. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  70. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  71. sglang/srt/layers/quantization/utils.py +5 -11
  72. sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
  73. sglang/srt/layers/quantization/w8a8_int8.py +8 -7
  74. sglang/srt/layers/radix_attention.py +28 -1
  75. sglang/srt/layers/rotary_embedding.py +15 -3
  76. sglang/srt/layers/sampler.py +5 -10
  77. sglang/srt/lora/backend/base_backend.py +18 -2
  78. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  79. sglang/srt/lora/backend/triton_backend.py +1 -1
  80. sglang/srt/lora/layers.py +1 -1
  81. sglang/srt/lora/lora.py +1 -1
  82. sglang/srt/lora/lora_manager.py +1 -1
  83. sglang/srt/managers/detokenizer_manager.py +0 -1
  84. sglang/srt/managers/io_struct.py +255 -97
  85. sglang/srt/managers/mm_utils.py +7 -5
  86. sglang/srt/managers/multimodal_processor.py +0 -2
  87. sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
  88. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  89. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  90. sglang/srt/managers/schedule_batch.py +64 -25
  91. sglang/srt/managers/scheduler.py +80 -82
  92. sglang/srt/managers/tokenizer_manager.py +18 -3
  93. sglang/srt/managers/tp_worker.py +1 -0
  94. sglang/srt/mem_cache/hiradix_cache.py +5 -1
  95. sglang/srt/mem_cache/memory_pool.py +21 -3
  96. sglang/srt/metrics/collector.py +9 -0
  97. sglang/srt/model_executor/cuda_graph_runner.py +9 -6
  98. sglang/srt/model_executor/forward_batch_info.py +234 -15
  99. sglang/srt/model_executor/model_runner.py +67 -35
  100. sglang/srt/model_loader/loader.py +31 -4
  101. sglang/srt/model_loader/weight_utils.py +4 -2
  102. sglang/srt/models/baichuan.py +2 -0
  103. sglang/srt/models/bert.py +398 -0
  104. sglang/srt/models/chatglm.py +1 -0
  105. sglang/srt/models/commandr.py +1 -0
  106. sglang/srt/models/dbrx.py +1 -0
  107. sglang/srt/models/deepseek.py +2 -1
  108. sglang/srt/models/deepseek_nextn.py +74 -70
  109. sglang/srt/models/deepseek_v2.py +494 -366
  110. sglang/srt/models/exaone.py +1 -0
  111. sglang/srt/models/gemma.py +1 -0
  112. sglang/srt/models/gemma2.py +1 -0
  113. sglang/srt/models/gemma3_causal.py +1 -0
  114. sglang/srt/models/gpt2.py +1 -0
  115. sglang/srt/models/gpt_bigcode.py +1 -0
  116. sglang/srt/models/granite.py +1 -0
  117. sglang/srt/models/grok.py +1 -0
  118. sglang/srt/models/internlm2.py +1 -0
  119. sglang/srt/models/llama.py +6 -5
  120. sglang/srt/models/llama4.py +101 -34
  121. sglang/srt/models/minicpm.py +1 -0
  122. sglang/srt/models/minicpm3.py +30 -200
  123. sglang/srt/models/mixtral.py +1 -0
  124. sglang/srt/models/mixtral_quant.py +1 -0
  125. sglang/srt/models/mllama.py +51 -8
  126. sglang/srt/models/mllama4.py +102 -29
  127. sglang/srt/models/olmo.py +1 -0
  128. sglang/srt/models/olmo2.py +1 -0
  129. sglang/srt/models/olmoe.py +1 -0
  130. sglang/srt/models/phi3_small.py +1 -0
  131. sglang/srt/models/qwen.py +1 -0
  132. sglang/srt/models/qwen2.py +5 -1
  133. sglang/srt/models/qwen2_5_vl.py +35 -70
  134. sglang/srt/models/qwen2_moe.py +15 -13
  135. sglang/srt/models/qwen2_vl.py +27 -25
  136. sglang/srt/models/qwen3.py +335 -0
  137. sglang/srt/models/qwen3_moe.py +423 -0
  138. sglang/srt/models/stablelm.py +1 -0
  139. sglang/srt/models/xverse.py +1 -0
  140. sglang/srt/models/xverse_moe.py +1 -0
  141. sglang/srt/openai_api/adapter.py +4 -1
  142. sglang/srt/patch_torch.py +11 -0
  143. sglang/srt/reasoning_parser.py +0 -1
  144. sglang/srt/sampling/sampling_batch_info.py +2 -3
  145. sglang/srt/server_args.py +55 -19
  146. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  147. sglang/srt/speculative/eagle_utils.py +1 -11
  148. sglang/srt/speculative/eagle_worker.py +10 -9
  149. sglang/srt/utils.py +136 -10
  150. sglang/test/attention/test_flashattn_backend.py +259 -221
  151. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  152. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  153. sglang/test/runners.py +5 -1
  154. sglang/test/test_block_fp8.py +224 -0
  155. sglang/test/test_custom_ops.py +1 -1
  156. sglang/test/test_utils.py +19 -8
  157. sglang/version.py +1 -1
  158. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +15 -5
  159. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +162 -147
  160. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
  161. sglang/lang/__init__.py +0 -0
  162. sglang/srt/disaggregation/conn.py +0 -81
  163. sglang/srt/lora/backend/__init__.py +0 -25
  164. sglang/srt/server.py +0 -18
  165. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
  166. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  """
2
- Minimal HTTP load balancer for prefill and decode servers for testing purpose.
2
+ Minimal HTTP load balancer for prefill and decode servers for testing.
3
3
  """
4
4
 
5
5
  import asyncio
@@ -22,64 +22,63 @@ class MiniLoadBalancer:
22
22
  def select_pair(self):
23
23
  return random.choice(self.prefill_servers), random.choice(self.decode_servers)
24
24
 
25
- async def generate_request(self, request_data):
26
- prefill_server, decode_server = self.select_pair()
25
+ async def generate(
26
+ self, modified_request, prefill_server, decode_server
27
+ ) -> ORJSONResponse:
27
28
 
28
- # Parse and transform prefill_server
29
- parsed_url = urllib.parse.urlparse(prefill_server)
30
- hostname = parsed_url.hostname
31
- bootstrap_host = f"{hostname}"
32
-
33
- modified_request = request_data.copy()
34
- modified_request.update(
35
- {
36
- "bootstrap_host": bootstrap_host,
37
- "bootstrap_room": random.randint(0, 2**63 - 1),
38
- }
39
- )
40
-
41
- async with aiohttp.ClientSession() as session:
42
- # Create the tasks
29
+ async with aiohttp.ClientSession(
30
+ timeout=aiohttp.ClientTimeout(
31
+ total=3600
32
+ ) # Add timeout for request reliability
33
+ ) as session:
43
34
  tasks = [
44
35
  session.post(f"{prefill_server}/generate", json=modified_request),
45
36
  session.post(f"{decode_server}/generate", json=modified_request),
46
37
  ]
38
+ # Wait for both responses to complete. Prefill should end first.
39
+ prefill_response, decode_response = await asyncio.gather(*tasks)
40
+
41
+ return ORJSONResponse(
42
+ content=await decode_response.json(),
43
+ status_code=decode_response.status,
44
+ )
45
+
46
+ async def generate_stream(self, modified_request, prefill_server, decode_server):
47
+ async def stream_results():
48
+ async with aiohttp.ClientSession(
49
+ timeout=aiohttp.ClientTimeout(
50
+ total=3600
51
+ ) # Add timeout for request reliability
52
+ ) as session:
53
+ try:
54
+ # Create the tasks for both prefill and decode requests
55
+ tasks = [
56
+ session.post(
57
+ f"{prefill_server}/generate", json=modified_request
58
+ ),
59
+ session.post(
60
+ f"{decode_server}/generate", json=modified_request
61
+ ),
62
+ ]
63
+ # Wait for both responses to complete. Since this is streaming, they return immediately.
64
+ prefill_response, decode_response = await asyncio.gather(*tasks)
65
+ async for chunk in decode_response.content:
66
+ yield chunk
67
+ except Exception as e:
68
+ error_msg = {
69
+ "error": {"message": f"Stream processing error: {str(e)}"}
70
+ }
71
+ yield b"data: " + orjson.dumps(
72
+ error_msg, option=orjson.OPT_NON_STR_KEYS
73
+ ) + b"\n\n"
74
+ finally:
75
+ if prefill_response is not None:
76
+ await prefill_response.release()
47
77
 
48
- prefill_response = None
49
- decode_response = None
50
-
51
- # Process responses as they arrive
52
- for i, response in enumerate(asyncio.as_completed(tasks)):
53
- response = await response
54
- # Check if this is the prefill or decode response based on order created
55
- if i == 0: # First completed task
56
- if str(response.url).startswith(prefill_server):
57
- prefill_response = response
58
- if response.status != 200:
59
- raise HTTPException(
60
- status_code=response.status,
61
- detail=f"Prefill server error: Status {response.status} Details: {await response.text()}",
62
- )
63
- else:
64
- decode_response = response
65
- if response.status != 200:
66
- raise HTTPException(
67
- status_code=response.status,
68
- detail=f"Decode server error: Status {response.status} Details: {await response.text()}",
69
- )
70
- else: # Second completed task
71
- if str(response.url).startswith(prefill_server):
72
- prefill_response = response
73
- else:
74
- decode_response = response
75
-
76
- if response.status != 200:
77
- raise HTTPException(
78
- status_code=response.status,
79
- detail=f"{'Prefill' if str(response.url).startswith(prefill_server) else 'Decode'} server error: Status {response.status} Details: {await response.text()}",
80
- )
81
-
82
- return await decode_response.json()
78
+ return StreamingResponse(
79
+ stream_results(),
80
+ media_type="text/event-stream",
81
+ )
83
82
 
84
83
 
85
84
  app = FastAPI()
@@ -169,78 +168,14 @@ async def handle_generate_request(request_data: dict):
169
168
  }
170
169
  )
171
170
 
172
- # Check if streaming is requested
173
171
  if request_data.get("stream", False):
174
-
175
- async def stream_results():
176
- async with aiohttp.ClientSession(
177
- timeout=aiohttp.ClientTimeout(total=3600)
178
- ) as session:
179
- try:
180
- # Create the tasks
181
- tasks = [
182
- session.post(
183
- f"{prefill_server}/generate", json=modified_request
184
- ),
185
- session.post(
186
- f"{decode_server}/generate", json=modified_request
187
- ),
188
- ]
189
-
190
- prefill_response = None
191
- decode_response = None
192
-
193
- # Process responses as they arrive
194
- for i, response_task in enumerate(asyncio.as_completed(tasks)):
195
- response = await response_task
196
-
197
- # Check the response immediately
198
- if str(response.url).startswith(prefill_server):
199
- prefill_response = response
200
- if response.status != 200:
201
- error_msg = {
202
- "error": {
203
- "message": f"Prefill server error: Status {response.status}, Details: {await response.text()}"
204
- }
205
- }
206
- yield b"data: " + orjson.dumps(
207
- error_msg, option=orjson.OPT_NON_STR_KEYS
208
- ) + b"\n\n"
209
- return
210
- else:
211
- decode_response = response
212
- if response.status != 200:
213
- error_msg = {
214
- "error": {
215
- "message": f"Decode server error: Status {response.status}"
216
- }
217
- }
218
- yield b"data: " + orjson.dumps(
219
- error_msg, option=orjson.OPT_NON_STR_KEYS
220
- ) + b"\n\n"
221
- return
222
-
223
- # Stream successful decode server response
224
- async for line in decode_response.content:
225
- yield line
226
- yield b"data: [DONE]\n\n"
227
-
228
- except Exception as e:
229
- error_msg = {
230
- "error": {"message": f"Stream processing error: {str(e)}"}
231
- }
232
- yield b"data: " + orjson.dumps(
233
- error_msg, option=orjson.OPT_NON_STR_KEYS
234
- ) + b"\n\n"
235
-
236
- return StreamingResponse(
237
- stream_results(),
238
- media_type="text/event-stream",
172
+ return await load_balancer.generate_stream(
173
+ modified_request, prefill_server, decode_server
174
+ )
175
+ else:
176
+ return await load_balancer.generate(
177
+ modified_request, prefill_server, decode_server
239
178
  )
240
-
241
- # Non-streaming case
242
- result = await load_balancer.generate_request(request_data)
243
- return ORJSONResponse(content=result)
244
179
 
245
180
 
246
181
  @app.get("/v1/models")
@@ -0,0 +1,6 @@
1
+ from .conn import (
2
+ MooncakeKVBootstrapServer,
3
+ MooncakeKVManager,
4
+ MooncakeKVReceiver,
5
+ MooncakeKVSender,
6
+ )